protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("vAlign", "bottom"))); if (sNode != null && sNode.Count > 0) { try { string temp = sNode.AsString(); Regex reg = new Regex(@",共[^页]+页"); string page = reg.Match(temp).Value.Replace(",共", "").Replace("页", ""); pageInt = Convert.ToInt32(page); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.ydszj.gov.cn/channel/138006/content?pageNo=" + i.ToString(), Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("style", "height:350px;"))), new HasAttributeFilter("li"))); if (nodeList != null && nodeList.Count > 0) { for (int j = 0; j < nodeList.Count; j++) { string prjType = nodeList[j].ToPlainTextString(); if (prjType.Contains("中标") || prjType.Contains("结果")) { string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = nodeList[j].ToPlainTextString().Replace(nodeList[j].ToPlainTextString().GetDateRegex(), ""); beginDate = nodeList[j].ToPlainTextString().GetDateRegex(); bidType = prjName.GetInviteBidType(); InfoUrl = "http://www.ydszj.gov.cn" + nodeList[j].GetATagHref(); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news_content"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); bidCtx = HtmlTxt.ToCtxString(); bidUnit = bidCtx.GetBidRegex(); buildUnit = bidCtx.GetBuildRegex(); prjAddress = bidCtx.GetAddressRegex(); code = bidCtx.GetCodeRegex(); bidMoney = bidCtx.GetMoneyRegex(); msgType = "英德市住房和城乡建设局"; specType = "建设工程"; BidInfo info = ToolDb.GenBidInfo("广东省", "清远市区", "英德市", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } else { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = nodeList[j].ToPlainTextString().Replace(nodeList[j].ToPlainTextString().GetDateRegex(), ""); beginDate = nodeList[j].ToPlainTextString().GetDateRegex(); inviteType = prjName.GetInviteBidType(); InfoUrl = "http://www.ydszj.gov.cn" + nodeList[j].GetATagHref(); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news_content"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); code = inviteCtx.GetCodeRegex(); prjAddress = inviteCtx.GetAddressRegex(); msgType = "英德市住房和城乡建设局"; specType = "建设工程"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "清远市区", "英德市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default).Replace(" ", ""); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "Body_div")), true), new TagNameFilter("li"))); if (sNode != null && sNode.Count > 0) { for (int t = 0; t < sNode.Count; t++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; INode node = sNode[t]; ATag aTag = node.GetATag(); prjName = aTag.GetAttribute("title"); beginDate = node.ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.tyjzsc.com.cn/" + aTag.Link.GetReplace("./"); string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("style", "width:650px;"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); code = inviteCtx.GetCodeRegex(); msgType = "太原市建设工程交易中心"; specType = "建设工程"; inviteType = prjName.GetInviteBidType(); InviteInfo info = ToolDb.GenInviteInfo("山西省", "山西省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); //parser = new Parser(new Lexer(HtmlTxt)); //NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); //if (aNode != null && aNode.Count > 0) //{ // for (int k = 0; k < aNode.Count; k++) // { // ATag a = aNode[k] as ATag; // if (a.IsAtagAttach()) // { // BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, a.Link); // base.AttachList.Add(attach); // } // } //} } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string cookiestr = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pager")), true), new TagNameFilter("a"))); if (pageNode != null && pageNode.Count > 0) { try { Regex reg = new Regex(@"[0-9]+"); string temp = reg.Match(pageNode[pageNode.Count - 1].GetATagHref().Replace("'", "")).Value; pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); eventValidation = this.ToolWebSite.GetAspNetEventValidation(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "hsa1$DD_LX", "hsa1$wd", "pager_input" }, new string[] { viewState, "pager", i.ToString(), "", eventValidation, "综合搜索", "", (i - 1).ToString() } ); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GV1"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 1; j < table.RowCount - 1; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, area = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; prjName = tr.Columns[0].ToNodePlainString(); if (prjName.Contains("[")) { area = prjName.Replace("[", "【").Replace("]", "】").GetRegexBegEnd("【", "】"); prjName = prjName.Replace("[" + area + "]", ""); } beginDate = tr.Columns[1].ToPlainTextString(); InfoUrl = "http://www.ycsggzy.cn/" + tr.Columns[0].GetATagHref().Replace("&", "&"); string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "Lb_nr"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml().GetReplace("<br>", "<br />"); inviteCtx = HtmlTxt.ToCtxString(); code = inviteCtx.GetCodeRegex(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex().Replace(" ", ""); inviteType = prjName.GetInviteBidType(); specType = "建设工程"; msgType = "银川市公共资源交易中心"; InviteInfo info = ToolDb.GenInviteInfo("宁夏回族自治区", "宁夏回族自治区及地市", "银川市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
public void DealHtml(IList list, string html, bool crawlAll) { Parser parserDtl = new Parser(new Lexer(html)); NodeList aNodes = parserDtl.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable"))); if (aNodes != null && aNodes.Count > 0) { Type typs = typeof(ATag); TableTag table = aNodes[0] as TableTag; for (int t = 1; t < table.RowCount - 1; t++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, ctx = string.Empty, CreateTime = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[t] as TableRow; ATag aTag = tr.SearchFor(typeof(ATag), true)[0] as ATag; InfoUrl = aTag.Link; prjName = table.Rows[t].Columns[1].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); endDate = table.Rows[t].Columns[2].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); string htmlDtl = string.Empty; try { htmlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); } catch (Exception ex) { continue; } Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>"); htmlDtl = regexHtml.Replace(htmlDtl, ""); Parser parserCtx = new Parser(new Lexer(htmlDtl)); NodeList ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "printTb lefttable"))); if (ctxNode != null && ctxNode.Count > 0) { Parser parserdiv = new Parser(new Lexer(htmlDtl)); NodeList aNodesdiv = parserdiv.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "biuuu_button"))); HtmlTxt = ctxNode.AsHtml().Replace(aNodesdiv.AsHtml(), "").Trim(); Type tp = typeof(ATag); TableTag tabTag = ctxNode[0] as TableTag; string startTime = tabTag.Rows[1].Columns[0].ToPlainTextString().Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); Regex regex = new Regex(@"时间:\d{4}-\d{1,2}-\d{1,2} \d{1,2}:\d{1,2}:\d{1,2}"); Match math = regex.Match(startTime); beginDate = math.Value.Replace("时间:", ""); Regex regexcode = new Regex("(工程编号|项目编号|招标编号):[^\r\n]+[\r\n]{1}"); Match match = regexcode.Match(tabTag.ToPlainTextString()); code = match.Value.Substring(match.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); Regex regexBuildUnit = new Regex("(招标人|建设单位|招标采购单位):[^\r\n]+[\r\n]{1}"); Match matchBuildUnit = regexBuildUnit.Match(tabTag.ToPlainTextString()); buildUnit = matchBuildUnit.Value.Substring(matchBuildUnit.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); Regex regexAddress = new Regex("(建设地点|项目地点|工程地点):[^\r\n]+[\r\n]{1}"); Match matchAddress = regexAddress.Match(tabTag.ToPlainTextString()); prjAddress = matchAddress.Value.Substring(matchAddress.Value.IndexOf(":") + 1).Replace("\r\n", "").Replace("\t", "").Replace(" ", " ").Trim(); ctx = tabTag.Rows[2].Columns[0].ToPlainTextString().Replace(" ", " ").Replace("\r\n\r\n\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); if (ctx.Length > 0) { Regex regexCtx = new Regex("<!--[^<]+-->"); ctx = regexCtx.Replace(ctx, ""); } if (Encoding.Default.GetByteCount(code) > 50) { code = ""; } if (buildUnit == "" || buildUnit == null) { buildUnit = ""; } if (Encoding.Default.GetByteCount(buildUnit) > 150) { buildUnit = buildUnit.Substring(0, 150); } if (Encoding.Default.GetByteCount(prjAddress) > 200) { prjAddress = "见招标公告内容"; } if (beginDate.Length > 0 && endDate.Length > 0) { DateTime begin = new DateTime(); DateTime end = new DateTime(); try { begin = DateTime.Parse(beginDate); end = DateTime.Parse(endDate); } catch (Exception) { } if (begin > end) { endDate = string.Empty; } } } parserCtx.Reset(); ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "toptd_bai"))); Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}"); beginDate = regDate.Match(ctxNode.AsString()).Value.Trim(); if (beginDate == "") { beginDate = string.Empty; } inviteType = ToolHtml.GetInviteTypes(prjName); InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠阳区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, ctx, remark, "惠州市建设工程交易中心", inviteType, "建设工程", string.Empty, InfoUrl, HtmlTxt); list.Add(info); ctxNode = parserCtx.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("target", "_blank"))); NodeList aTagNodes = ctxNode.SearchFor(typeof(ATag), true); for (int a = 0; a < aTagNodes.Count; a++) { ATag fileTage = aTagNodes[a] as ATag; if (fileTage.Link.Contains("http://www.ebc.huizhou.gov.cn/index/loadNewsFile")) { string downloadURL = fileTage.Link; BaseAttach attach = ToolDb.GenBaseAttach(fileTage.ToPlainTextString(), info.Id, downloadURL); base.AttachList.Add(attach); } } if (!crawlAll && list.Count >= this.MaxCount) { return; } } } }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); string html = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int pageInt = 1; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder2_lblSumPage"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode[0].ToNodePlainString(); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); eventValidation = this.ToolWebSite.GetAspNetEventValidation(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION" }, new string[] { "ctl00$ContentPlaceHolder2$lnkBtnNext", "", viewState, "96852609", eventValidation }); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "695"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 0; j < table.RowCount - 1; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; ATag aTag = tr.GetATag(); prjName = aTag.LinkText.Trim(); beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex(); InfoUrl = "http://js.panyu.gov.cn/" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_txtContent"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml();//.Replace("<br", "\r\n<br"); inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); code = inviteCtx.GetCodeRegex().GetCodeDel(); buildUnit = inviteCtx.GetBuildRegex(); if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } if (buildUnit.Contains("地址")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址")); } if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } prjAddress = inviteCtx.GetAddressRegex(); inviteType = prjName.GetInviteBidType(); msgType = "广州市番禺区住房和建设局"; specType = "政府采购"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州政府采购", "番禺区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (fileNode != null && fileNode.Count > 0) { for (int k = 0; k < fileNode.Count; k++) { ATag fileAtag = fileNode[k].GetATag(); if (fileAtag.IsAtagAttach()) { string fileName = fileAtag.LinkText.ToNodeString().Replace(" ", ""); string fileLink = fileAtag.Link; if (!fileLink.ToLower().Contains("http")) { fileLink = "http://js.panyu.gov.cn/" + fileAtag.Link; } base.AttachList.Add(ToolDb.GenBaseAttach(fileName, info.Id, fileLink)); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); //取得页码 int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1"))); if (tdNodes != null && tdNodes.Count > 0) { string htlPage = tdNodes.ToHtml(); parser = new Parser(new Lexer(htlPage)); NodeFilter filer = new TagNameFilter("a"); NodeList pageList = parser.ExtractAllNodesThatMatch(filer); if (pageList != null && pageList.Count > 0) { for (int i = pageList.Count - 1; i >= 0; i--) { try { ATag aTag = pageList.SearchFor(typeof(ATag), true)[i] as ATag; string pageTemp = aTag.Link.Replace("main.aspx?flg=3&id=6&page=", ""); pageInt = int.Parse(pageTemp); break; } catch (Exception ex) { } } } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8); } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760"))); if (nodeList != null && nodeList.Count > 0) { TableTag table = nodeList[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; beginDate = tr.Columns[2].ToPlainTextString().Trim(); ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag; prjName = aTag.LinkText; InfoUrl = "http://www.uho.cn/" + aTag.Link; string htmldetail = string.Empty; try { htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").Trim(); Parser dtlparserHTML = new Parser(new Lexer(htmldetail)); NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#FFFFFF"), new TagNameFilter("table"))); HtmlTxt = dtnodeHTML.AsHtml(); htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).ToLower().Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n"); Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>"); htmldetail = regexHtml.Replace(htmldetail, ""); } catch (Exception ex) { continue; } Parser dtlparser = new Parser(new Lexer(htmldetail)); NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#ffffff"), new TagNameFilter("table"))); inviteCtx = dtnode.AsString(); specType = "其他"; msgType = "深圳市友和保险经纪有限公司"; inviteType = ToolHtml.GetInviteTypes(prjName); InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true), new TagNameFilter("script"))); if (sNode != null && sNode.Count > 0) { try { string page = sNode.ToString().Replace("createPageHTML(", "").Replace(",", "kd").Replace("****", "").Replace("\n", ""); page = page.GetRegexBegEnd("Code", "kd"); pageInt = int.Parse(page); //80CGBMPRINKJACJ } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://sgjd.baoan.gov.cn/zbcg/zbgg_139207/index_" + (i - 1) + ".html", Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "content clearfix")), true), new TagNameFilter("li"))); if (viewList != null && viewList.Count > 0) { for (int j = 0; j < viewList.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = viewList[j].GetATag(); prjName = aTag.GetAttribute("title"); beginDate = viewList[j].ToPlainTextString().GetDateRegex(); InfoUrl = aTag.Link; InfoUrl = InfoUrl.GetRegexBegEnd("./", ".html"); InfoUrl = "http://sgjd.baoan.gov.cn/zbcg/zbgg_139207/" + InfoUrl + ".html"; string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "con"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); inviteType = prjName.GetInviteBidType(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); if (prjAddress.Contains("电话")) { prjAddress = prjAddress.Remove(prjAddress.IndexOf("电话")); } code = inviteCtx.GetCodeRegex().GetCodeDel(); if (code.Contains("》")) { code = code.Remove(code.IndexOf("》")); } if (code.Contains("现对本采购项目的招标事宜公告如下")) { code = code.Replace("现对本采购项目的招标事宜公告如下", ""); } specType = "建设工程"; inviteType = "小型工程"; msgType = "深圳市宝安区松岗街道办事处"; if (string.IsNullOrEmpty(buildUnit)) { buildUnit = "深圳市宝安区松岗街道办事处"; } inviteType = ToolHtml.GetInviteType(inviteType); InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("select")); if (sNode != null && sNode.Count > 0) { try { SelectTag selTag = sNode[0] as SelectTag; string temp = selTag.OptionTags[selTag.OptionTags.Length - 1].ToNodePlainString(); pageInt = Convert.ToInt32(temp); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "taxis_border"))); if (viewList != null && viewList.Count > 0) { TableTag table = viewList[0] as TableTag; for (int j = 1; j < table.RowCount; j++) { TableRow tr = table.Rows[j]; string prjName = string.Empty, InfoUrl = string.Empty, beginDate = string.Empty, HtmlTxt = string.Empty; ATag aTag = tr.GetATag(); if (aTag == null) { continue; } prjName = aTag.LinkText.Trim(); InfoUrl = "http://www.hzhlz.gov.cn/" + aTag.Link; beginDate = tr.Columns[3].ToPlainTextString().GetDateRegex(); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsContent"))); if (dtl != null && dtl.Count > 0) { HtmlTxt = dtl.AsHtml(); string ctx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); parser.Reset(); NodeList nodeName = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "font1"))); if (nodeName != null && nodeName.Count > 0) { prjName = nodeName[0].ToNodePlainString().GetReplace(" ").Trim(); } else { continue; } if (prjName.Contains("中标") || prjName.Contains("成交") || prjName.Contains("结果")) { string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty; bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); code = bidCtx.GetCodeRegex().GetCodeDel(); buildUnit = bidCtx.GetBuildRegex(); if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } bidUnit = bidCtx.GetBidRegex(); if (string.IsNullOrEmpty(bidUnit)) { bidUnit = bidCtx.GetRegex("中标候选公司"); } bidMoney = bidCtx.GetMoneyRegex(); try { if (decimal.Parse(bidMoney) > 100000) { bidMoney = (decimal.Parse(bidMoney) / 10000).ToString(); } } catch { } msgType = "广东省惠州市惠城区横沥镇人民政府"; specType = "政府采购"; bidType = prjName.GetInviteBidType(); BidInfo info = ToolDb.GenBidInfo("广东省", "惠州市区", "惠城区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k].GetATag(); if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.hzhlz.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } else { string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty; inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); inviteType = prjName.GetInviteBidType(); code = inviteCtx.GetCodeRegex().GetCodeDel(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } msgType = "广东省惠州市惠城区横沥镇人民政府"; specType = "政府采购"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "惠城区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k].GetATag(); if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.hzhlz.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); string html = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int pageInt = 1; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); // NodeList pageNode = parser.ExtractAllNodesThatMatch((new TagNameFilter("div"))); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ewb-page-items clearfix"))); if (pageNode != null && pageNode.Count > 0) { string sa = pageNode.AsHtml(); string dd = sa.Replace("</p>", "\r\n").ToCtxString(); string temp = dd.GetRegexBegEnd("1/", "\r").GetReplace("\r\n", ""); pageInt = int.Parse(temp); } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.hyggzy.gov.cn/zfcg/" + i + ".html"); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "ewb-com-item clearfix"))); if (nodeList != null && nodeList.Count > 0) { for (int j = 0; j < nodeList.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = nodeList[j].GetATag(); prjName = aTag.GetAttribute("title"); InfoUrl = "http://www.hyggzy.gov.cn" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ewb-list-bd"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString(); prjAddress = inviteCtx.GetRegex("公开摇珠地点"); beginDate = inviteCtx.GetRegex("发布时间"); beginDate = beginDate.GetDateRegex(); buildUnit = inviteCtx.GetBuildRegex(); if (string.IsNullOrWhiteSpace(prjAddress)) { prjAddress = inviteCtx.GetAddressRegex(); } prjAddress = inviteCtx.GetRegex(new string[] { "开标地点" }).GetCodeDel(); if (string.IsNullOrWhiteSpace(prjAddress)) { prjAddress = inviteCtx.GetRegex("投标及开标地点"); } if (string.IsNullOrWhiteSpace(buildUnit)) { buildUnit = inviteCtx.GetRegex("项目单位"); } if (string.IsNullOrWhiteSpace(endDate)) { endDate = inviteCtx.GetRegexBegEnd("至", "止"); } endDate = endDate.GetReplace("年", "-").GetReplace("月", "-").GetReplace("日", ""); code = inviteCtx.GetCodeRegex(new string[] { "项目编号" }).GetCodeDel(); if (string.IsNullOrWhiteSpace(code)) { code = inviteCtx.GetRegexBegEnd("编号:", "采购").GetReplace("\r\n", ""); } if (string.IsNullOrWhiteSpace(code)) { code = inviteCtx.GetRegexBegEnd("项目(", ")").GetReplace("\r\n", ""); } msgType = "河源市公共资源交易中心"; specType = "政府采购"; inviteType = ToolHtml.GetInviteTypes(prjName); InviteInfo info = ToolDb.GenInviteInfo("广东省", "河源市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k] as ATag; if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.hyggzy.gov.cn" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "menu"))); if (sNode != null && sNode.Count > 0) { try { string temp = sNode.AsString().ToNodeString().Replace("条共", ""); Regex reg = new Regex(@"条[^页]+页"); string page = reg.Match(temp).Value.Replace("条", "").Replace("页", ""); pageInt = Convert.ToInt32(page); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "90%"))); if (nodeList != null && nodeList.Count > 1) { TableTag table = nodeList[nodeList.Count - 2] as TableTag; for (int j = 0; j < table.RowCount; j++) { TableRow tr = table.Rows[j]; string prjType = tr.Columns[1].ToNodePlainString(); if (prjType.Contains("中标") || prjType.Contains("结果")) { string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = prjType; beginDate = tr.Columns[3].ToNodePlainString(); bidType = prjName.GetInviteBidType(); InfoUrl = "http://net.suixi.gov.cn/com/ghjsj/" + tr.Columns[1].GetATagHref(); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "92%"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); bidCtx = HtmlTxt.ToCtxString(); bidUnit = bidCtx.GetBidRegex(); if (string.IsNullOrEmpty(bidUnit)) { bidUnit = bidCtx.ToNodeString().GetRegexBegEnd("评标委员会推荐", "为中标候选"); } bidMoney = bidCtx.GetMoneyRegex(); if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0") { bidMoney = bidCtx.ToNodeString().GetRegexBegEnd("中标价", ","); } prjAddress = bidCtx.GetAddressRegex(); code = bidCtx.GetCodeRegex(); buildUnit = bidCtx.GetBuildRegex(); msgType = "遂溪住房和城乡规划建设局"; specType = "建设工程"; BidInfo info = ToolDb.GenBidInfo("广东省", "湛江市区", "遂溪县", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } else { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = prjType; beginDate = tr.Columns[3].ToNodePlainString(); inviteType = prjName.GetInviteBidType(); InfoUrl = "http://net.suixi.gov.cn/com/ghjsj/" + tr.Columns[1].GetATagHref(); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "92%"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); code = inviteCtx.GetCodeRegex(); prjAddress = inviteCtx.GetAddressRegex(); msgType = "遂溪住房和城乡规划建设局"; specType = "建设工程"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "湛江市区", "遂溪县", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); string html = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int pageInt = 1; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("script"), new HasAttributeFilter("type", "text/javascript"))); string b = pageNode.AsString().GetCtxBr(); string c = b.Replace("('", "徐鑫").Replace("')", "凯德"); if (pageNode != null && pageNode.Count > 0) { try { string temp = c.GetRegexBegEnd("徐鑫", "凯德"); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i >= 1) { NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( new string[] { "fcInfotitle", "currentPage" }, new string[] { "", i.ToString() } ); try { string a1 = nvc[0]; html = this.ToolWebSite.GetHtmlByUrl("https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/TradeInfo/GovProcurement/findListByPage?fcInfotype=1&openbidbelong=ZJ", nvc, Encoding.UTF8); } catch { continue; } } JavaScriptSerializer serializer = new JavaScriptSerializer(); Dictionary <string, object> smsTypeJson = (Dictionary <string, object>)serializer.DeserializeObject(html); foreach (KeyValuePair <string, object> obj in smsTypeJson) { object[] array = (object[])obj.Value; foreach (object arrValue in array) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; Dictionary <string, object> dic = (Dictionary <string, object>)arrValue; code = Convert.ToString(dic["fcTendersn"]); prjName = Convert.ToString(dic["fcInfotitle"]); beginDate = Convert.ToString(dic["fcInfostartdate"]).GetDateRegex("yyyy-MM-dd"); string xu = Convert.ToString(dic["publishinfoid"]); InfoUrl = "https://www.dgzb.com.cn/ggzy/website/WebPagesManagement/TradeInfo/GovProcurement/govdetail?publishinfoid=" + xu + "&fcInfotype=1"; //fcInfocontent string htmldtl = string.Empty; try { htmldtl = dic["fcInfocontent"].ToString(); // htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } bool isTable = true; parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("align", "center"))); if (dtlNode == null || dtlNode.Count < 1) { isTable = false; parser = new Parser(new Lexer(htmldtl)); dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body")); } if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.ToHtml(); if (isTable) { TableTag dtlTable = dtlNode[0] as TableTag; for (int d = 0; d < dtlTable.RowCount; d++) { try { inviteCtx += dtlTable.Rows[d].Columns[0].ToPlainTextString().Replace(":", "").Replace(":", "") + ":"; inviteCtx += dtlTable.Rows[d].Columns[1].ToPlainTextString() + "\r\n"; } catch { } } if (string.IsNullOrEmpty(inviteCtx)) { inviteCtx = HtmlTxt.ToCtxString(); } } else { // inviteCtx = HtmlTxt.ToCtxString(); inviteCtx = HtmlTxt.Replace("</p>", "\r\n").ToCtxString(); } prjAddress = inviteCtx.GetRegexBegEnd("地点", "十"); if (string.IsNullOrEmpty(prjAddress)) { prjAddress = inviteCtx.GetAddressRegex(); } inviteType = prjName.GetInviteBidType(); msgType = "东莞市政府采购"; specType = "政府采购"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aTagNode != null && aTagNode.Count > 0) { for (int k = 0; k < aTagNode.Count; k++) { ATag aTag = aTagNode[k].GetATag(); if (aTag.IsAtagAttach()) { string linkurl = aTag.Link; linkurl = linkurl.Replace("&", "&"); string cc = string.Empty; string aa = linkurl.GetRegexBegEnd("&", "id"); if (aa == "") { cc = linkurl; } else { cc = linkurl.Replace(aa, ""); } BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, cc); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } //Parser parser = new Parser(new Lexer(html)); //NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("script"), new HasAttributeFilter("type", "text/javascript"))); //if (pageNode != null && pageNode.Count > 0) //{ // try // { // string temp = pageNode.AsString().GetRegexBegEnd("找到", "条"); // int page = int.Parse(temp); // pageInt = page / 30 + 1; // } // catch { } //} //for (int i = 1; i <= pageInt; i++) //{ // if (i > 1) // { // NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( // new string[] { "ec_i", // "topicChrList_crd", // "topicChrList_f_a", // "topicChrList_p", // "topicChrList_s_stockProjectCode", // "topicChrList_s_name", // "topicChrList_s_ldate", // "id", // "method", // "__ec_pages", // "topicChrList_rd", // "topicChrList_f_stockProjectCode", // "topicChrList_f_name", // "topicChrList_f_ldate",}, // new string[]{ // "topicChrList", // "30", // "", // i.ToString(), // "topicChrList_s_stockProjectCode", // "topicChrList_s_name", // "topicChrList_s_ldate", // "1662", // "view", // "1", // "30", // "","","" // } // ); // try // { // html = this.ToolWebSite.GetHtmlByUrl("http://dggp.dg.gov.cn/portal/topicView.do?", nvc, Encoding.UTF8); // } // catch // { // continue; // } // } // parser = new Parser(new Lexer(html)); // NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_table"))); // if (listNode != null && listNode.Count > 0) // { // TableTag table = listNode[0] as TableTag; // for (int j = 3; j < table.RowCount; j++) // { // TableRow tr = table.Rows[j]; // string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; // prjName = tr.Columns[1].ToNodePlainString(); // code = tr.Columns[0].ToNodePlainString(); // buildUnit = tr.Columns[2].ToNodePlainString(); // beginDate = tr.Columns[3].ToPlainTextString().GetDateRegex("yyyy年MM月dd日"); // InfoUrl = "http://dggp.dg.gov.cn/portal/documentView.do?method=view&id=" + tr.Columns[1].GetATagHref().Replace("/viewer.do?id=", ""); // string htmldtl = string.Empty; // try // { // htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); // } // catch // { // continue; // } // bool isTable = true; // parser = new Parser(new Lexer(htmldtl)); // NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bulletinContent"))); // if (dtlNode == null || dtlNode.Count < 1) // { // isTable = false; // parser.Reset(); // dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "container"))); // } // if (dtlNode == null || dtlNode.Count < 1) // { // isTable = false; // parser = new Parser(new Lexer(htmldtl)); // dtlNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("body")); // } // if (dtlNode != null && dtlNode.Count > 0) // { // HtmlTxt = dtlNode.ToHtml(); // if (isTable) // { // TableTag dtlTable = dtlNode[0] as TableTag; // for (int d = 0; d < dtlTable.RowCount; d++) // { // try // { // inviteCtx += dtlTable.Rows[d].Columns[0].ToPlainTextString().Replace(":", "").Replace(":", "") + ":"; // inviteCtx += dtlTable.Rows[d].Columns[1].ToPlainTextString() + "\r\n"; // } // catch { } // } // } // if (string.IsNullOrEmpty(inviteCtx)) inviteCtx = HtmlTxt.ToCtxString(); // prjAddress = inviteCtx.GetAddressRegex(); // inviteType = prjName.GetInviteBidType(); // msgType = "东莞市政府采购"; // specType = "政府采购"; // InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); // list.Add(info); // parser = new Parser(new Lexer(HtmlTxt)); // NodeList aTagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); // if (aTagNode != null && aTagNode.Count > 0) // { // for (int k = 0; k < aTagNode.Count; k++) // { // ATag aTag = aTagNode[k].GetATag(); // if (aTag.IsAtagAttach()) // { // BaseAttach attach = ToolDb.GenBaseAttach(aTag.LinkText, info.Id, "http://dggp.dg.gov.cn" + aTag.Link); // base.AttachList.Add(attach); // } // } // } // if (!crawlAll && list.Count >= this.MaxCount) return list; // } // } // } //} } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); Dictionary <string, string> dic = new Dictionary <string, string>(); dic.Add("工程招标", "gczb/"); dic.Add("货物招标", "hwzb/"); dic.Add("服务招标", "fwzb/"); foreach (string key in dic.Keys) { int pageInt = 1, listCount = 0; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + dic[key]); } catch { continue; } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_page_text"))); if (pageNode != null && pageNode.Count > 0) { string temp = pageNode.AsString(); try { pageInt = int.Parse(temp.GetRegexBegEnd("HTML", ",").Replace("(", "")); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + dic[key] + "index_" + (i - 1).ToString() + ".shtml"); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "list_1_right_list")), true), new TagNameFilter("ul"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; beginDate = listNode[j].ToPlainTextString().GetDateRegex(); ATag aTag = listNode[j].GetATag(); prjName = aTag.LinkText; InfoUrl = this.SiteUrl + dic[key] + aTag.Link.Replace("../", "").Replace("./", "").Replace(" ", ""); string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); parser = new Parser(new Lexer(HtmlTxt)); NodeList namNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text_bt"))); if (namNode != null && namNode.Count > 0) { string temp = prjName; prjName = namNode[0].ToNodePlainString(); if (!prjName.Contains(temp.Replace(".", ""))) { prjName = temp; } } inviteType = key; specType = "建设工程"; msgType = "天津市发展和改革委员会"; InviteInfo info = ToolDb.GenInviteInfo("天津市", "天津市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); listCount++; if (!crawlAll && listCount >= this.MaxCount) { goto type; } } } } } type : continue; } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "lblPageCount"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode[0].ToNodePlainString(); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); eventValidation = this.ToolWebSite.GetAspNetEventValidation(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__LASTFOCUS", "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION", "gcbh_Text_Box", "gcmc_TextBox", "num_TextBox", "ImageButton3.x", "ImageButton3.y" }, new string[] { "", "", "", viewState, "B0108473", eventValidation, "", "", "", "5", "12" }); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "DataGrid1"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { TableRow tr = table.Rows[j]; ATag aTag = tr.Columns[1].GetATag(); if (aTag == null) { continue; } string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = aTag.LinkText.GetReplace(" "); buildUnit = tr.Columns[3].ToNodePlainString(); endDate = tr.Columns[4].ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.bcactc.com/home/gcxx/" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "hei_text"))); if (dtlNode != null && dtlNode.Count > 0) { TableTag dtlTable = dtlNode[0] as TableTag; HtmlTxt = dtlTable.ToHtml(); for (int r = 0; r < dtlTable.RowCount; r++) { for (int c = 0; c < dtlTable.Rows[r].ColumnCount; c++) { if ((c + 1) % 2 == 0) { inviteCtx += dtlTable.Rows[r].Columns[c].ToNodePlainString() + "\r\n"; } else { inviteCtx += dtlTable.Rows[r].Columns[c].ToNodePlainString().GetReplace(":,:") + ":"; } } } code = inviteCtx.GetCodeRegex(); prjAddress = inviteCtx.GetAddressRegex().GetCodeDel(); beginDate = inviteCtx.GetRegex("登记日期,登记时间").GetDateRegex(); msgType = "北京市建设工程发包承包交易中心"; specType = "建设工程"; inviteType = "勘察设计"; InviteInfo info = ToolDb.GenInviteInfo("北京市", "北京市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "pagination")), true), new TagNameFilter("a"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode[pageNode.Count - 2].GetATag().Link.Replace("31", "").Replace(".htm", ""); pageInt = int.Parse(temp); } catch { pageInt = 32; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.topway.com.cn/6/12/default_" + (i - 1).ToString() + ".htm", Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("id", "newsList")), true), new TagNameFilter("li"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = listNode[j].GetATagValue("title"); beginDate = listNode[j].ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.topway.com.cn/6/12/" + listNode[j].GetATagHref(); string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fwRight float_l marginL_10"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml().Replace("<br>", "\r\n").Replace("<br/>", "\r\n").Replace("<br />", "\r\n"); inviteCtx = HtmlTxt.ToCtxString().Replace("“", "").Replace("“", "").Replace("”", ""); buildUnit = inviteCtx.GetBuildRegex(); code = inviteCtx.GetCodeRegex(); prjAddress = inviteCtx.GetAddressRegex(); inviteType = prjName.GetInviteBidType(); msgType = "深圳市天威视讯股份有限公司"; specType = "其他"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("align", "left"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode.AsString().GetRegexBegEnd("/", "页"); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); eventValidation = this.ToolWebSite.GetAspNetEventValidation(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "Title", "zbr", "xmszd", "zbfwpt", "kbrq", "zblx", "zb", "page" }, new string[] { "", "", "", "", "", "", "1", i.ToString() }); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("dt"), new HasAttributeFilter("class", "ny_news")), true), new TagNameFilter("li"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = listNode[j].GetATag(); prjName = aTag.GetAttribute("title"); if (string.IsNullOrWhiteSpace(prjName)) { prjName = aTag.LinkText; } InfoUrl = aTag.Link.Trim(); beginDate = listNode[j].ToPlainTextString().GetDateRegex(); string htmldetail = string.Empty; try { htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); } catch (Exception) { continue; } Parser parserdetail = new Parser(new Lexer(htmldetail)); NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ny_wz"))); if (dtnode != null && dtnode.Count > 0) { HtmlTxt = dtnode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); code = inviteCtx.GetCodeRegex(); if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } msgType = "海南省发展和改革委员会"; specType = "建设工程"; inviteType = ToolHtml.GetInviteTypes(prjName); InviteInfo info = ToolDb.GenInviteInfo("海南省", "海南省及地市", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k] as ATag; if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://ztb.hainan.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 3; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default); } catch { return(list); } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default); } catch { continue; } } Parser parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("valign", "top"))), new TagNameFilter("table"))); if (viewList != null && viewList.Count > 0) { TableTag table = null; for (int l = 0; l < viewList.Count; l++) { TableTag tagTable = viewList[l] as TableTag; if (tagTable.RowCount >= 1) { TableColumn cell = tagTable.Rows[0].Columns[0]; if (cell.GetAttribute("width") == "30" && cell.GetAttribute("height") == "25") { table = tagTable; } } } if (table == null) { continue; } for (int j = 0; j < table.RowCount; j++) { string prjName = string.Empty, InfoUrl = string.Empty, beginDate = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; ATag aTag = tr.GetATag(); if (aTag == null) { continue; } prjName = aTag.GetAttribute("title"); beginDate = table.ToNodePlainString().GetDateRegex(); InfoUrl = "http://www.tanbu.gov.cn/" + aTag.Link; string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "706"))); if (dtl != null && dtl.Count > 0) { HtmlTxt = dtl.AsHtml(); if (prjName.Contains("中标") || prjName.Contains("成交") || prjName.Contains("结果") || prjName.Contains("候选人公示")) { string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty; bidCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); code = bidCtx.GetCodeRegex().GetCodeDel(); buildUnit = bidCtx.GetBuildRegex(); if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } bidUnit = bidCtx.GetBidRegex(); if (string.IsNullOrEmpty(bidUnit)) { bidUnit = bidCtx.GetRegex("中标候选公司,中标候选人"); } bidMoney = bidCtx.GetMoneyRegex(); if (string.IsNullOrEmpty(bidUnit)) { parser = new Parser(new Lexer(HtmlTxt)); NodeList bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellspacing", "1"))); if (bidNode != null && bidNode.Count > 0) { string ctx = string.Empty; TableTag bidTable = bidNode[0] as TableTag; if (bidTable.RowCount >= 2 && bidTable.Rows[0].ColumnCount >= 2) { ctx = bidTable.Rows[0].Columns[1].ToNodePlainString() + ":"; ctx += bidTable.Rows[1].Columns[1].ToNodePlainString() + "\r\n"; } bidUnit = ctx.GetBidRegex(new string[] { "候选人" }); } } try { if (decimal.Parse(bidMoney) > 100000) { bidMoney = (decimal.Parse(bidMoney) / 10000).ToString(); } } catch { } msgType = "广州市花都区炭步镇人民政府"; specType = "政府采购"; bidType = prjName.GetInviteBidType(); BidInfo info = ToolDb.GenBidInfo("广东省", "广州政府采购", "花都区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k].GetATag(); if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.tanbu.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } else { string code = string.Empty, buildUnit = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty; inviteCtx = HtmlTxt.ToLower().GetReplace("</p>,</br>,<br>", "\r\n").ToCtxString(); inviteType = prjName.GetInviteBidType(); code = inviteCtx.GetCodeRegex().GetCodeDel(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } msgType = "广州市花都区炭步镇人民政府"; specType = "政府采购"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "广州政府采购", "花都区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k].GetATag(); if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.tanbu.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1101; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl); } catch { return(null); } for (int i = 1; i < pageInt; i++) { if (i > 1) { NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "listVo.projectName", "listVo.serviceType", "listVo.selectType", "listVo.simpleProject", "listVo.parentProject", "listVo.parentRewardProject", "listVo.rewardProject", "listVo.selfProject", "pageNumber", "sourtType" }, new string[] { "", "", "", "true", "true", "true", "true", "true", (i - 1).ToString(), "" }); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8); } catch { continue; } } Parser parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "table table-hover table-bordered table-list"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 1; j < table.RowCount; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; ATag aTag = tr.Columns[0].GetATag(); prjName = aTag.LinkText.ToRegString(); beginDate = tr.Columns[3].ToPlainTextString().GetDateRegex(); buildUnit = tr.Columns[1].ToNodePlainString(); InfoUrl = "http://183.63.34.189" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "pannel-cont"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); parser = new Parser(new Lexer(HtmlTxt)); NodeList bidNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("table")); if (bidNode != null && bidNode.Count > 0) { string ctx = string.Empty; TableTag bidTable = bidNode[0] as TableTag; for (int r = 0; r < bidTable.RowCount; r++) { for (int c = 0; c < bidTable.Rows[r].ColumnCount; c++) { string temp = bidTable.Rows[r].Columns[c].ToNodePlainString(); if (c % 2 == 0) { ctx += temp + ":"; } else { ctx += temp + "\r\n"; } } } code = ctx.GetCodeRegex().GetCodeDel(); prjAddress = ctx.GetAddressRegex(); } inviteCtx = HtmlTxt.ToCtxString(); msgType = "惠州市中介超市建设领导小组办公室"; specType = "建设工程"; inviteType = prjName.GetInviteBidType(); InviteInfo info = ToolDb.GenInviteInfo("广东省", "惠州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k] as ATag; if (a.IsAtagAttach() || a.Link.Contains("downloadfile")) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://183.63.34.189/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "id1"))); if (sNode != null && sNode.Count > 0) { try { SelectTag tag = sNode[0] as SelectTag; string temp = tag.OptionTags[tag.OptionTags.Length - 1].StringText; pageInt = int.Parse(temp.GetReplace("第,页")); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "&page=" + i, Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "4"))); if (viewList != null && viewList.Count > 0) { TableTag table = viewList[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { TableRow tr = table.Rows[j]; if (tr.ColumnCount == 1) { continue; } string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = tr.Columns[1].GetATag(); beginDate = tr.Columns[1].ToPlainTextString().GetDateRegex(); prjName = aTag.GetAttribute("title"); InfoUrl = "http://liaobu.dg.gov.cn/" + aTag.Link; string htmDtl = string.Empty; try { htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>"); htmDtl = regexHtml.Replace(htmDtl, ""); } catch { continue; } parser = new Parser(new Lexer(htmDtl)); NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "cont"))); if (dtl != null && dtl.Count > 0) { HtmlTxt = dtl.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); inviteType = prjName.GetInviteBidType(); prjAddress = inviteCtx.GetAddressRegex(); buildUnit = inviteCtx.GetBuildRegex(); code = inviteCtx.GetCodeRegex().GetCodeDel(); msgType = "东莞市寮步镇政府"; specType = "政府采购"; inviteType = prjName.GetInviteBidType(); InviteInfo info = ToolDb.GenInviteInfo("广东省", "东莞市区", "寮步镇", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k].GetATag(); if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://liaobu.dg.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsPage"))); if (sNode != null && sNode.Count > 0) { try { string temp = sNode.AsString().Replace("createPageHTML(", "").Replace("index", "").Replace("html", "").Replace(", 0,", "").Replace(");", "").Replace(",", "").Replace(";", "").Replace(")", "").Replace("\"", "").Replace(" ", ""); pageInt = Convert.ToInt32(temp); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/index_" + (i - 1).ToString() + ".html", Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "NewsLiks01Text"))), new TagNameFilter("ul"))), new TagNameFilter("li"))); if (viewList != null && viewList.Count > 0) { for (int j = 0; j < viewList.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; Regex regDate = new Regex(@"\d{4}-\d{1,2}-\d{1,2}"); beginDate = regDate.Match(viewList[j].ToPlainTextString().Trim()).Value; string temp = viewList[j].ToPlainTextString().Trim().Replace(beginDate, ""); try { int beg = temp.IndexOf("else{"), end = temp.Length; temp = temp.Substring(beg, end - beg); beg = temp.IndexOf("<a"); end = temp.IndexOf("/a>"); temp = temp.Substring(beg, (end - beg) + 3); beg = temp.IndexOf(">"); end = temp.IndexOf("</"); prjName = temp.Substring(beg + 1, end - beg - 1); Parser p = new Parser(new Lexer(temp)); NodeList l = p.ExtractAllNodesThatMatch(new TagNameFilter("a")); ATag aTag = l.SearchFor(typeof(ATag), true)[0] as ATag; InfoUrl = "http://www.shajing.gov.cn/xxgk_14947/ywxx/zbcg/zbgg/" + aTag.Link.Replace("../", "").Replace("./", ""); } catch { continue; } string htlDtl = string.Empty, ctx = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8); Regex regexHtml = new Regex(@"<script[^<]*</script>|<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>"); htlDtl = regexHtml.Replace(htlDtl, ""); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "DivContent"))); if (dtl != null && dtl.Count > 0) { HtmlTxt = System.Text.RegularExpressions.Regex.Replace(dtl.AsHtml(), "(<script)[\\s\\S]*?(</script>)", ""); inviteCtx = System.Text.RegularExpressions.Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", ""); inviteCtx = System.Text.RegularExpressions.Regex.Replace(inviteCtx, "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); string InvType = prjName; if (InvType.Contains("施工")) { inviteType = "施工"; } if (InvType.Contains("监理")) { inviteType = "监理"; } if (InvType.Contains("设计")) { inviteType = "设计"; } if (InvType.Contains("勘察")) { inviteType = "勘察"; } if (InvType.Contains("服务")) { inviteType = "服务"; } if (InvType.Contains("劳务分包")) { inviteType = "劳务分包"; } if (InvType.Contains("专业分包")) { inviteType = "专业分包"; } if (InvType.Contains("小型施工")) { inviteType = "小型工程"; } if (InvType.Contains("设备材料")) { inviteType = "设备材料"; } Regex regPrjAddr = new Regex(@"(工程位置|工程地点|工程地址|详细地址|地点|地址)(:|:)[^\r\n]+\r\n"); prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程位置", "").Replace("工程地点", "").Replace("工程地址", "").Replace("详细地址", "").Replace("地点", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim(); Regex regBuildUnit = new Regex(@"(招标代理机构|招标单位|招标人|招标单位(盖章))(:|:)[^\r\n]+\r\n"); buildUnit = regBuildUnit.Match(inviteCtx).Value.Replace("招标代理机构", "").Replace("招标单位", "").Replace("招标人", "").Replace("(盖章)", "").Replace(":", "").Replace(":", "").Trim(); Regex regPrjCode = new Regex(@"(工程编号|项目编号|编号)(:|:)[^\r\n]+\r\n"); code = regPrjCode.Match(inviteCtx).Value.Replace("工程编号", "").Replace("项目编号", "").Replace("编号", "").Replace(":", "").Replace(":", "").Trim(); msgType = "深圳市宝安区沙井街道办事处"; if (string.IsNullOrEmpty(prjAddress) || Encoding.Default.GetByteCount(prjAddress) > 150) { prjAddress = "见招标信息"; } code = ToolHtml.GetSubString(code, 50); buildUnit = ToolHtml.GetSubString(buildUnit, 150); specType = "建设工程"; inviteType = "小型工程"; if (string.IsNullOrEmpty(buildUnit)) { buildUnit = "深圳市宝安区沙井街道办事处"; } inviteType = ToolHtml.GetInviteType(inviteType); InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "宝安区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoList1_Pager"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode[0].ToPlainTextString().GetRegexBegEnd("总页数", "当前").Replace(":", ""); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" }, new string[] { viewState, "MoreInfoList1$Pager", i.ToString() } ); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; ATag aTag = tr.Columns[1].GetATag(); prjName = aTag.GetAttribute("title"); beginDate = tr.ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.gaxqjyzx.com" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); code = inviteCtx.GetCodeRegex(); buildUnit = inviteCtx.GetBuildRegex(); if (buildUnit.Contains("招标代理机构")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理机构")); } if (buildUnit.Contains("代理机构")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("代理机构")); } prjAddress = inviteCtx.GetAddressRegex(); inviteType = prjName.GetInviteBidType(); specType = "建设工程"; msgType = "贵安新区公共资源交易中心"; InviteInfo info = ToolDb.GenInviteInfo("贵州省", "贵州省及地市", "贵安新区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "p_bar"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode.AsString().GetRegexBegEnd("/", "页"); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.cqzb.gov.cn/class-5-1(" + i + ").aspx"); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "ztb_list_right")), true), new TagNameFilter("li"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, city = string.Empty; INode node = listNode[j]; ATag aTag = node.GetATag(); prjName = aTag.GetAttribute("title"); beginDate = node.ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.cqzb.gov.cn/" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "ztb_zbxx1"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); code = inviteCtx.GetCodeRegex(); specType = "建设工程"; inviteType = prjName.GetInviteBidType(); msgType = "重庆市招标投标综合网"; InviteInfo info = ToolDb.GenInviteInfo("重庆市", "重庆市及区县", city, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k] as ATag; if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.cqzb.gov.cn/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); string htl = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int page = 1; string eventValidation = string.Empty; try { htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default, ref cookiestr); Regex regexHtml = new Regex(@"<script[^<]*</script>"); htl = regexHtml.Replace(htl, ""); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(htl)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("align", "right"))); Regex regexPage = new Regex(@"共\d+页"); try { page = int.Parse(regexPage.Match(nodeList.AsString()).Value.Trim(new char[] { '共', '页' })); } catch (Exception) { } for (int i = 1; i < page; i++) { if (i > 1) { try { htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&otype=&pageNum=" + i.ToString()), Encoding.Default); Regex regexHtml = new Regex(@"<script[^<]*</script>"); htl = regexHtml.Replace(htl, ""); } catch (Exception ex) { continue; } } parser = new Parser(new Lexer(htl)); NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellpadding", "1"))); if (tableNodeList != null && tableNodeList.Count > 0) { TableTag table = (TableTag)tableNodeList[0]; for (int j = 0; j < table.RowCount; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; TableRow tr = table.Rows[j]; prjName = tr.Columns[0].ToPlainTextString().Trim(); beginDate = tr.Columns[1].ToPlainTextString().Replace(" ", "").Trim().Substring(0, 10); ATag aTag = tr.Columns[0].SearchFor(typeof(ATag), true)[0] as ATag; InfoUrl = "http://market.meizhou.gov.cn/website/deptwebsite/1925/Content.jsp?issueId=15488&msgType=00&filePath=" + aTag.GetAttribute("onclick").Replace("showDeptContent('1925','", ""); int ii = InfoUrl.IndexOf("'"); string oo = InfoUrl.Remove(ii).Trim(); if (oo.Contains("content.php")) { string url = aTag.GetAttribute("onclick").Replace("showDeptContent('1925','", ""); ii = url.IndexOf("'"); oo = url.Remove(ii).ToString(); } string htmldetail = string.Empty; try { htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(oo), Encoding.Default).Replace(" ", ""); Regex regexHtml = new Regex(@"<script[^<]*</script>"); htmldetail = regexHtml.Replace(htmldetail, ""); } catch (Exception) { Logger.Error("InviteMeiZhoouJS"); continue; } Parser parserdetail = new Parser(new Lexer(htmldetail)); NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("P"), new HasAttributeFilter("class", "MsoNormal"))); if (dtnode == null || dtnode.Count < 1) { parserdetail = new Parser(new Lexer(htmldetail)); dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "WordSection1"))); } if (dtnode.Count > 0 && dtnode != null) { HtmlTxt = dtnode.AsHtml(); for (int k = 0; k < dtnode.Count; k++) { string tr1 = string.Empty; tr1 = dtnode[k].ToPlainTextString().Replace(" ", "").Trim(); if (k == 0) { string InvType = tr1; inviteType = ToolHtml.GetInviteTypes(InvType); } inviteCtx += tr1 + ":" + "\r\n"; } Regex regPrjAddr = new Regex(@"(工程地点|建设地点):[^\r\n]+\r\n"); prjAddress = regPrjAddr.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("建设地点", "").Replace(":", "").Replace(";", "").Trim(); Regex bildUnit = new Regex(@"(招标人|招标人(盖章)|招标人):[^\r\n]+[\r\n]{1}"); buildUnit = bildUnit.Match(inviteCtx).Value.Replace("招 标人:", "").Replace("招标人(盖章):", "").Replace("招标人:", "").Trim(); if (buildUnit != "" && buildUnit.Contains(":")) { int zz = buildUnit.IndexOf(":"); buildUnit = buildUnit.Remove(zz).ToString(); } Regex regcode = new Regex(@"(招标项目编号|项目编号)(:|:)[^\r\n]+[\r\n]{1}"); code = regcode.Match(inviteCtx).Value.Replace("招标项目编号", "").Replace("项目编号", "").Replace(":", "").Replace(":", "").Trim(); Regex regoType = new Regex(@"工程类型:[^\r\n]+\r\n"); string oType = regoType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim(); if (oType.Contains("房建")) { otherType = "房建及工业民用建筑"; } else if (oType.Contains("市政")) { otherType = "市政工程"; } else if (oType.Contains("园林绿化")) { otherType = "园林绿化工程"; } else if (oType.Contains("装饰") || oType.Contains("装修")) { otherType = "装饰装修工程"; } else if (oType.Contains("电力")) { otherType = "电力工程"; } else if (oType.Contains("水利")) { otherType = "水利工程"; } if (oType.Contains("环保")) { otherType = "环保工程"; } if (buildUnit == "") { buildUnit = ""; } if (buildUnit.Contains("梅州市建设工程交易中心")) { buildUnit = ""; } msgType = "梅州市建设工程交易中心"; specType = "建设工程"; inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = o ns = ", "").Replace("urn:schemas-microsoft-com:office:office", "").Replace("/>", "").Trim(); inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = ns0 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim(); inviteCtx = inviteCtx.Replace("<?", "").Replace("xml:namespace prefix = st1 ns = ", "").Replace("urn:schemas-microsoft-com:office:smarttags", "").Replace("/>", "").Trim(); inviteCtx = inviteCtx.Replace("?xml:namespaceprefix=o/>", "").Trim(); inviteCtx = inviteCtx.Replace("<?xml:namespaceprefix=st1/>", "").Trim(); if (Encoding.Default.GetByteCount(code) >= 50) { code = string.Empty; } if (Encoding.Default.GetByteCount(prjAddress) >= 150) { prjAddress = string.Empty; } InviteInfo info = ToolDb.GenInviteInfo("广东省", "梅州市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, oo, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); string html = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int pageInt = 1; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("form"), new HasAttributeFilter("name", "qPageForm"))); if (pageNode != null && pageNode.Count > 0) { try { NodeList aNode = new Parser(new Lexer(pageNode.ToHtml())).ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { string temp = aNode[aNode.Count - 2].GetATagHref().Replace("turnOverPage", "").Replace("(", "").Replace(")", "").Replace(";", ""); pageInt = int.Parse(temp); } } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "channelCode", "pageIndex", "pageSize", "pointPageIndexId" }, new string[] { "0005", i.ToString(), "15", "1" }); try { html = this.ToolWebSite.GetHtmlByUrl("http://yangjiang.gdgpo.com/queryMoreInfoList.do", nvc, Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "m_m_c_list")), true), new TagNameFilter("li"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = listNode[j].GetATag(1); prjName = aTag.GetAttribute("title"); beginDate = listNode[j].ToPlainTextString().GetDateRegex(); InfoUrl = "http://yangjiang.gdgpo.com" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zw_c_c_cont"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml();//.Replace("<br", "\r\n<br"); inviteCtx = HtmlTxt.ToCtxString(); code = inviteCtx.GetCodeRegex().GetCodeDel(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); inviteType = prjName.GetInviteBidType(); msgType = "阳江市政府采购"; specType = "政府采购"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "阳江市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); parser = new Parser(new Lexer(HtmlTxt)); NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (fileNode != null && fileNode.Count > 0) { for (int k = 0; k < fileNode.Count; k++) { ATag fileAtag = fileNode[k].GetATag(); if (fileAtag.IsAtagAttach()) { string fileName = fileAtag.LinkText.ToNodeString().Replace(" ", ""); string fileLink = fileAtag.Link; if (!fileLink.ToLower().Contains("http")) { fileLink = "http://yangjiang.gdgpo.gov.cn" + fileAtag.Link; } base.AttachList.Add(ToolDb.GenBaseAttach(fileName, info.Id, fileLink)); } } } list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "yesh fl"))); if (sNode != null && sNode.Count > 0) { try { string temp = sNode[0].ToNodePlainString().GetRegexBegEnd("/", "页"); pageInt = int.Parse(temp); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.szlhxq.gov.cn/mzbsc/zwgk69/cgzb/zbgg282/14843-" + i.ToString() + ".html", Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList viewList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "news1_list")), true), new TagNameFilter("li"))); if (viewList != null && viewList.Count > 0) { for (int j = 0; j < viewList.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; beginDate = viewList[j].ToNodePlainString().GetDateRegex(); ATag aTag = viewList[j].GetATag(); prjName = aTag.GetAttribute("title"); InfoUrl = "http://www.szlhxq.gov.cn" + aTag.Link; string htmDtl = string.Empty; try { htmDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmDtl)); NodeList dtl = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "tit-content"))); if (dtl != null && dtl.Count > 0) { HtmlTxt = dtl.AsHtml(); inviteCtx = System.Text.RegularExpressions.Regex.Replace(HtmlTxt, "(<script)[\\s\\S]*?(</script>)", ""); inviteCtx = System.Text.RegularExpressions.Regex.Replace(inviteCtx.Replace("<br/>", "\r\n").Replace("<BR/>", "\r\n").Replace("<BR>", "\r\n").Replace("<br>", "\r\n"), "<[^>]*>", "").Replace(" ", "").Replace(" ", "").Replace("\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); inviteType = prjName.GetInviteBidType(); prjAddress = inviteCtx.GetAddressRegex(); if (prjAddress.Contains("**")) { prjAddress = string.Empty; } buildUnit = inviteCtx.GetBuildRegex(); if (buildUnit.Contains("资质")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("资质")); } code = inviteCtx.GetCodeRegex().GetCodeDel(); msgType = "深圳市龙华新区民治街道办事处"; if (string.IsNullOrEmpty(prjAddress)) { prjAddress = "见中标信息"; } specType = "建设工程"; inviteType = "小型工程"; if (string.IsNullOrEmpty(buildUnit)) { buildUnit = "深圳市龙华新区民治街道办事处"; } InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳区及街道工程", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "wb-page-li"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode.AsString().GetRegexBegEnd("/", "\r"); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { SiteUrl = "http://www.jxsggzy.cn/web/jyxx/002006/002006001/" + i + ".html"; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr); } catch { continue; } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "ewb-list-node clearfix"))); if (listNode != null && listNode.Count > 0) { for (int j = 0; j < listNode.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty; ATag aTag = listNode[j].GetATag(); prjName = aTag.GetAttribute("title"); if (string.IsNullOrWhiteSpace(prjName)) { prjName = aTag.LinkText; } beginDate = listNode[j].ToPlainTextString().GetDateRegex(); if (prjName[2].Equals('县') || prjName[2].Equals('区') || prjName[2].Equals('市')) { area = prjName.Substring(0, 3); } string a = aTag.Link; InfoUrl = "http://www.jxsggzy.cn" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article-info"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); prjAddress = inviteCtx.GetAddressRegex(); buildUnit = inviteCtx.GetBuildRegex(); if (buildUnit.Contains("单位章")) { buildUnit = string.Empty; } if (buildUnit.Contains("联系人")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("联系人")); } code = inviteCtx.GetCodeRegex().GetCodeDel(); inviteType = "政府采购"; specType = "政府采购"; msgType = "江西省公共资源交易中心"; InviteInfo info = ToolDb.GenInviteInfo("江西省", "江西省及地市", area, string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; string cookiestr = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default, ref cookiestr); } catch { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("nowrap", "true"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode.AsString().GetRegexBegEnd("总页数", "当前页").Replace(":", ""); pageInt = int.Parse(temp); } catch { } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { viewState = this.ToolWebSite.GetAspNetViewState(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT" }, new string[] { viewState, "MoreInfoList1$Pager", i.ToString() }); try { html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.Default, ref cookiestr); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1"))); if (listNode != null && listNode.Count > 0) { TableTag table = listNode[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty, area = string.Empty; TableRow tr = table.Rows[j]; ATag aTag = tr.Columns[1].GetATag(); prjName = aTag.GetAttribute("title"); beginDate = tr.Columns[2].ToPlainTextString().GetDateRegex(); InfoUrl = "http://www.dyggzyjyzx.com" + aTag.Link; string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); prjAddress = inviteCtx.GetAddressRegex(); buildUnit = inviteCtx.GetBuildRegex(); if (buildUnit.Contains("公司")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("公司")) + "公司"; } if (buildUnit.Contains("地址")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("地址")); } if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")); } code = inviteCtx.GetCodeRegex().GetCodeDel(); msgType = "大冶市公共资源交易中心"; specType = "建设工程"; inviteType = prjName.GetInviteBidType(); buildUnit = buildUnit.Replace(" ", ""); InviteInfo info = ToolDb.GenInviteInfo("湖北省", "湖北省及地市", "大冶市", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parser = new Parser(new Lexer(HtmlTxt)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int k = 0; k < aNode.Count; k++) { ATag a = aNode[k] as ATag; if (a.IsAtagAttach()) { string link = string.Empty; if (a.Link.ToLower().Contains("http")) { link = a.Link; } else { link = "http://www.dyggzyjyzx.com/" + a.Link; } BaseAttach attach = ToolDb.GenBaseAttach(a.LinkText, info.Id, link); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
public async Task <AttendeeInvite> GetInviteAsync(InviteInfo inviteInfo) { return(await _attendeeRepository.GetInviteAsync(inviteInfo) .ConfigureAwait(continueOnCapturedContext: false)); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); int pageInt = 1; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(html)); NodeList sNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("width", "700"))); if (sNode != null && sNode.Count > 0) { try { string temp = sNode.AsString().ToNodeString(); Regex reg = new Regex(@"/[^下一页]+下一页"); string page = reg.Match(temp).Value.Replace("/", "").Replace("下一页", ""); pageInt = Convert.ToInt32(page); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl("http://www.stjs.gov.cn/zbtb/qxzhaobiaonews.asp?page=" + i.ToString(), Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "705"))); if (nodeList != null && nodeList.Count > 0) { TableTag table = nodeList[nodeList.Count - 1] as TableTag; for (int j = 0; j < table.RowCount; j++) { TableRow tr = table.Rows[j]; string prjType = tr.Columns[1].ToNodePlainString(); if (prjType.Contains("中标") || prjType.Contains("结果")) { string prjName = string.Empty, buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty, code = string.Empty, bidDate = string.Empty, beginDate = string.Empty, endDate = string.Empty, bidType = string.Empty, specType = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, bidCtx = string.Empty, prjAddress = string.Empty, remark = string.Empty, prjMgr = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = prjType.Replace("[中标公示]", ""); bidType = prjName.GetInviteBidType(); beginDate = tr.Columns[2].ToNodePlainString().Replace(".", "-"); InfoUrl = "http://www.stjs.gov.cn/" + tr.Columns[1].GetATagHref().Replace("../", ""); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "680"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); bidCtx = HtmlTxt.ToCtxString(); string ctx = string.Empty; parser = new Parser(new Lexer(HtmlTxt)); NodeList tabList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))); if (tabList != null && tabList.Count > 0) { TableTag tab = tabList[0] as TableTag; for (int k = 0; k < tab.RowCount; k++) { for (int d = 0; d < tab.Rows[k].ColumnCount; d++) { if (d % 2 == 0) { ctx += tab.Rows[k].Columns[d].ToPlainTextString().Replace(" ", "").Replace("\n", "").Replace("\r", "") + ":"; } else { ctx += tab.Rows[k].Columns[d].ToPlainTextString().Replace(" ", "").Replace("\n", "").Replace("\r", "") + "\r\n"; } } } bidUnit = ctx.GetBidRegex(); bidMoney = ctx.GetMoneyRegex(); buildUnit = ctx.GetBuildRegex(); prjAddress = ctx.GetAddressRegex(); code = ctx.GetCodeRegex(); } else { bidUnit = bidCtx.GetBidRegex(); bidMoney = bidCtx.GetMoneyRegex(); buildUnit = bidCtx.GetBuildRegex(); prjAddress = bidCtx.GetAddressRegex(); code = bidCtx.GetCodeRegex(); } msgType = "汕头建设网"; specType = "建设工程"; BidInfo info = ToolDb.GenBidInfo("广东省", "汕头市区", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } else { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = prjType.Replace("[招标公告]", ""); inviteType = prjName.GetInviteBidType(); beginDate = tr.Columns[2].ToNodePlainString().Replace(".", "-"); InfoUrl = "http://www.stjs.gov.cn/" + tr.Columns[1].GetATagHref().Replace("../", ""); string htlDtl = string.Empty; try { htlDtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default); htlDtl = htlDtl.GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htlDtl)); NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "680"))); if (dtlList != null && dtlList.Count > 0) { HtmlTxt = dtlList.ToHtml(); inviteCtx = HtmlTxt.ToCtxString().Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); string ctx = string.Empty; parser = new Parser(new Lexer(HtmlTxt)); NodeList tabList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))); if (tabList != null && tabList.Count > 0) { TableTag tab = tabList[0] as TableTag; for (int k = 0; k < tab.RowCount; k++) { for (int d = 0; d < tab.Rows[k].ColumnCount; d++) { if (d % 2 == 0) { ctx += tab.Rows[k].Columns[d].ToPlainTextString().Replace(" ", "").Replace("\r\n", "") + ":"; } else { ctx += tab.Rows[k].Columns[d].ToPlainTextString().Replace(" ", "").Replace("\r\n", "") + "\r\n"; } } } buildUnit = ctx.GetBuildRegex(); code = ctx.GetCodeRegex().Replace("/", ""); prjAddress = ctx.GetAddressRegex(); } else { buildUnit = inviteCtx.GetBuildRegex(); code = inviteCtx.GetCodeRegex().Replace("/", ""); prjAddress = inviteCtx.GetAddressRegex(); } msgType = "汕头建设网"; specType = "建设工程"; InviteInfo info = ToolDb.GenInviteInfo("广东省", "汕头市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } } return(list); }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new List <InviteInfo>(); string htl = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int page = 1; string eventValidation = string.Empty; try { htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch { return(list); } Parser parser = new Parser(new Lexer(htl)); NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "wb-page-items clearfix"))); if (pageNode != null && pageNode.Count > 0) { try { string temp = pageNode[0].ToPlainTextString().GetRegexBegEnd("/", "转到"); page = int.Parse(temp); } catch { } } for (int i = 1; i <= page; i++) { if (i > 1) { try { htl = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?pageing=" + i.ToString(), Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(htl)); NodeList node = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "r-items")), true), new TagNameFilter("li"))); if (node != null && node.Count > 0) { for (int j = 0; j < node.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; ATag aTag = node[j].GetATag(); InfoUrl = "http://ggzy.yunfu.gov.cn" + aTag.Link; prjName = aTag.LinkText; beginDate = node[j].ToPlainTextString().GetDateRegex(); string htmldetail = string.Empty; try { htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "").GetJsString(); } catch { continue; } Parser parserdetail = new Parser(new Lexer(htmldetail)); NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "mainContent"))); if (dtnode != null && dtnode.Count > 0) { HtmlTxt = dtnode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString().Replace("“", "").Replace("”", ""); string ctx = inviteCtx; if (string.IsNullOrEmpty(inviteCtx)) { ctx = dtnode.ToHtml().Replace("</span>", "\r\n").ToCtxString().Replace("-->", "").Replace(" ", "").Replace("。", "").Replace("\r", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Trim(); inviteCtx = dtnode.ToHtml().ToCtxString().Replace("-->", "").Replace(" ", "").Replace("。", "").Replace("\r", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\t\r\n\n\t", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\n\r\n\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Trim(); Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>"); inviteCtx = regexHtml.Replace(inviteCtx, "").Replace("(盖章)", "").Replace(" ", "").Trim(); ctx = regexHtml.Replace(ctx, "").Replace("(盖章)", "").Replace(" ", "").Trim(); } buildUnit = ctx.GetBuildRegex(); prjAddress = ctx.GetAddressRegex(); msgType = "云浮市建设工程交易中心"; specType = "建设工程"; if (buildUnit.Contains("招标代理机构")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理机构")).ToString().Trim(); } if (buildUnit.Contains("招标代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("招标代理")).ToString().Trim(); } buildUnit = buildUnit.Replace(":潘先生联系电话:0766--3481216", "").Trim(); parserdetail.Reset(); NodeList nameNode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("h2"), new HasAttributeFilter("class", "post-title"))); if (nameNode != null && nameNode.Count > 0) { string tempName = nameNode[0].ToNodePlainString(); if (!string.IsNullOrWhiteSpace(tempName)) { prjName = tempName; } } inviteType = ToolHtml.GetInviteTypes(prjName); InviteInfo info = ToolDb.GenInviteInfo("广东省", "云浮市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); parserdetail = new Parser(new Lexer(HtmlTxt)); NodeList file = parserdetail.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (file != null && file.Count > 0) { for (int d = 0; d < file.Count; d++) { ATag aFile = file.SearchFor(typeof(ATag), true)[d] as ATag; if (aFile.IsAtagAttach()) { string downloadURL = "http://ggzy.yunfu.gov.cn/" + aFile.Link; BaseAttach attach = ToolDb.GenBaseAttach(aFile.LinkText, info.Id, downloadURL); base.AttachList.Add(attach); } } } if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }
internal void Update(InviteInfo model) { Update(model as InviteReference); if (model.IsRevoked != null) IsRevoked = model.IsRevoked.Value; if (model.IsTemporary != null) IsTemporary = model.IsTemporary.Value; if (model.MaxAge != null) MaxAge = model.MaxAge.Value; if (model.MaxUses != null) MaxUses = model.MaxUses.Value; if (model.Uses != null) Uses = model.Uses.Value; if (model.CreatedAt != null) CreatedAt = model.CreatedAt.Value; }
protected override IList ExecuteCrawl(bool crawlAll) { IList list = new ArrayList(); string htl = string.Empty; string cookiestr = string.Empty; string viewState = string.Empty; int page = 1; string eventValidation = string.Empty; try { htl = this.ToolWebSite.GetHtmlByUrl(base.SiteUrl, Encoding.Default); } catch (Exception ex) { return(list); } Parser parser = new Parser(new Lexer(htl)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("class", "wb-page-default wb-page-number wb-page-family"))); if (nodeList != null && nodeList.Count > 0) { try { string temp = nodeList.AsString().Replace("1/", ""); page = int.Parse(temp); } catch { } } for (int i = 1; i <= page; i++) { if (i > 1) { //NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(new string[]{ // "theID=","pageNo"}, new string[] { "76", i.ToString() }); try { //htl = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8); htl = this.ToolWebSite.GetHtmlByUrl("http://www.jysggzy.com/TPFront/jsgc/004001/?pageing=" + i, Encoding.Default); } catch (Exception ex) { continue; } } parser = new Parser(new Lexer(htl)); NodeList tableNodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "ewb-data-items ewb-pt6")), true), new TagNameFilter("li"))); if (tableNodeList != null && tableNodeList.Count > 0) { for (int j = 0; j < tableNodeList.Count; j++) { string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, inviteCon = string.Empty, InfoUrl = string.Empty, CreateTime = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty; prjName = tableNodeList[j].ToNodePlainString().Replace(" ", ""); beginDate = tableNodeList[j].ToPlainTextString().GetDateRegex(); if (!string.IsNullOrEmpty(beginDate)) { prjName = prjName.Replace(beginDate, ""); } InfoUrl = "http://www.jysggzy.com/" + tableNodeList[j].GetATagHref(); string htmldtl = string.Empty; try { htmldtl = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htmldtl)); NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("id", "TDContent"))); if (dtlNode != null && dtlNode.Count > 0) { HtmlTxt = dtlNode.AsHtml(); inviteCtx = HtmlTxt.ToCtxString(); buildUnit = inviteCtx.GetBuildRegex(); prjAddress = inviteCtx.GetAddressRegex(); if (buildUnit.Contains("代理")) { buildUnit = buildUnit.Remove(buildUnit.IndexOf("代理")); } code = inviteCtx.GetCodeRegex(); if (buildUnit.Contains("联系电话")) { buildUnit = string.Empty; } msgType = "揭阳市建设工程交易中心"; inviteType = prjName.GetInviteBidType(); InviteInfo info = ToolDb.GenInviteInfo("广东省", "揭阳市区", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt); list.Add(info); //parser = new Parser(new Lexer(HtmlTxt)); //NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); //if (fileNode != null && fileNode.Count > 0) //{ // for (int k = 0; k < fileNode.Count; k++) // { // ATag fileAtag = fileNode[k].GetATag(); // if (fileAtag.IsAtagAttach()) // { // try // { // string link = string.Empty; // if (fileAtag.Link.Contains("gdgpo")) // link = fileAtag.Link; // else // link = "http://jieyang.gdgpo.com" + fileAtag.Link; // BaseAttach attach = ToolDb.GenBaseAttach(fileAtag.LinkText, info.Id, link); // base.AttachList.Add(attach); // } // catch { } // } // } //} if (!crawlAll && list.Count >= this.MaxCount) { return(list); } } } } } return(list); }