/// <summary>
/// Crawls the paged list at <c>SiteUrl</c>, opens the detail page linked from each table row and
/// stores it as a <c>NotifyInfo</c> of type "政策法规", followed by the images and attachment links
/// found inside the detail content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been submitted for saving.
/// </param>
/// <returns>Always <c>null</c>; records are persisted through <c>ToolDb</c> instead of being returned.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return null;
    }

    // Determine the page count: the value between "(" and ")" of the last anchor inside div.scott.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "scott")), true), new TagNameFilter("a")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            string temp = pageList[pageList.Count - 1].GetATagValue().Replace("(", "kdxx").Replace(")", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
            pageInt = Convert.ToInt32(temp);
        }
        catch
        {
            // Unparseable pager: fall back to a single page.
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Follow-up pages are requested by posting back the hidden form fields of the current page.
                string typeId = ToolHtml.GetHtmlInputValue(html, "typeId");
                string boardId = ToolHtml.GetHtmlInputValue(html, "boardId");
                string totalRows = ToolHtml.GetHtmlInputValue(html, "totalRows");
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "typeId", "boardId", "totalRows", "pageNO" },
                    new string[] { typeId, boardId, totalRows, i.ToString() });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "lefttable")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            // The matched node is not a table tag; skip this page instead of failing on a null reference.
            continue;
        }

        // The first and the last row are not treated as data rows
        // (presumably header and pager rows - TODO confirm against the site).
        for (int j = 1; j < table.RowCount - 1; j++)
        {
            TableRow tr = table.Rows[j];
            string infoType = "政策法规";
            string headName = tr.Columns[1].ToNodePlainString();
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = tr.Columns[1].GetATagHref();
            string infoScorce = string.Empty;

            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "context_div")));
            if (dtlList == null || dtlList.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlList.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = MsgTypeCosnt.ZhongShanMsgType;
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "中山市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Images embedded in the detail content.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (imgList != null && imgList.Count > 0)
            {
                for (int img = 0; img < imgList.Count; img++)
                {
                    ImageTag imgTag = imgList[img] as ImageTag;
                    if (imgTag == null)
                    {
                        continue;
                    }
                    try
                    {
                        BaseAttach baseInfo = ToolHtml.GetBaseAttachByUrl(imgTag.GetAttribute("src"), headName, info.Id);
                        if (baseInfo != null)
                        {
                            ToolDb.SaveEntity(baseInfo, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort, same as for link attachments below:
                        // one failed image download must not abort the whole crawl.
                    }
                }
            }

            // Attachment links in the detail content.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList attachList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (attachList != null && attachList.Count > 0)
            {
                for (int a = 0; a < attachList.Count; a++)
                {
                    ATag aTag = attachList[a] as ATag;
                    if (aTag == null || !aTag.IsAtagAttach())
                    {
                        continue;
                    }
                    try
                    {
                        BaseAttach obj = ToolHtml.GetBaseAttachByUrl(aTag.Link, aTag.LinkText, info.Id);
                        if (obj != null)
                        {
                            ToolDb.SaveEntity(obj, string.Empty);
                        }
                    }
                    catch
                    {
                        // Best effort: skip attachments that cannot be downloaded or stored.
                    }
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Walks the ASP.NET-paged grid "GridView1" at <c>SiteUrl</c>, loads the detail page of every row and
/// stores it as a <c>NotifyInfo</c> of type "通知公告". Image sources in the detail content are rewritten
/// to absolute URLs and saved as attachments, as are download/attachment links.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns once the running record counter reaches <c>MaxCount</c>.
/// </param>
/// <returns>Always <c>null</c>; records are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int totalPages = 1;
    int processed = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    // The href of the last pager anchor carries the last page number between ",'" and "'".
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerLinks = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")), true), new TagNameFilter("a")));
    if (pagerLinks != null && pagerLinks.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagerLinks[pagerLinks.Count - 1].GetATagHref().GetRegexBegEnd(",'", "'"));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            // Emulate the pager post-back using the hidden state fields of the page currently held.
            string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
            string[] fieldNames = new string[] { "__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION", "TBKey", "AspNetPager1_input" };
            string[] fieldValues = new string[] { viewState, "E997B95C", "AspNetPager1", page.ToString(), eventValidation, "", (page - 1).ToString() };
            NameValueCollection postData = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, postData);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList grids = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (grids == null || grids.Count <= 0)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // Row 0 is skipped; data rows start at index 1.
        for (int rowIndex = 1; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            ATag titleLink = row.Columns[1].GetATag();
            string title = titleLink.LinkText;
            string published = row.Columns[2].ToPlainTextString().GetDateRegex();
            string detailUrl = "http://www.sdzb.gov.cn/" + titleLink.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "96%")));
            if (contentTables == null || contentTables.Count <= 0)
            {
                continue;
            }

            string contentHtml = contentTables.AsHtml();
            // The plain-text content is derived before image URLs are rewritten below.
            string contentText = contentHtml.ToCtxString();

            // Rewrite every image source to an absolute URL and remember it for the attachment pass.
            List<string> imageLinks = new List<string>();
            parser = new Parser(new Lexer(contentHtml));
            NodeList images = parser.ExtractAllNodesThatMatch(new TagNameFilter("img"));
            if (images != null && images.Count > 0)
            {
                for (int n = 0; n < images.Count; n++)
                {
                    ImageTag image = images[n] as ImageTag;
                    string absolute = "http://www.sdzb.gov.cn" + image.ImageURL.GetReplace("../,./");
                    contentHtml = contentHtml.GetReplace(image.ImageURL, absolute);
                    imageLinks.Add(absolute);
                }
            }

            NotifyInfo notice = ToolDb.GenNotifyInfo(title, published, string.Empty, "山东省建设工程招标投标管理办公室", detailUrl, contentHtml, "山东省", "山东省及地市", "", contentText, "通知公告");

            // The counter is advanced before the limit check.
            processed++;
            if (!crawlAll && processed >= this.MaxCount)
            {
                return null;
            }

            if (!ToolDb.SaveEntity(notice, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            foreach (string imageLink in imageLinks)
            {
                try
                {
                    BaseAttach imageAttach = ToolHtml.GetBaseAttachByUrl(imageLink, title, notice.Id);
                    if (imageAttach != null)
                    {
                        ToolDb.SaveEntity(imageAttach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                }
            }

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors == null || anchors.Count <= 0)
            {
                continue;
            }

            for (int n = 0; n < anchors.Count; n++)
            {
                ATag anchor = anchors[n] as ATag;
                if (!anchor.Link.ToLower().Contains("download") && !anchor.IsAtagAttach())
                {
                    continue;
                }

                string fileLink = anchor.Link.ToLower().Contains("http")
                    ? anchor.Link
                    : "http://www.sdzb.gov.cn" + anchor.Link.GetReplace("../,./");

                // Links longer than 500 bytes are not stored.
                if (Encoding.Default.GetByteCount(fileLink) > 500)
                {
                    continue;
                }

                try
                {
                    BaseAttach fileAttach = ToolHtml.GetBaseAttachByUrl(fileLink, anchor.LinkText, notice.Id);
                    if (fileAttach != null)
                    {
                        ToolDb.SaveEntity(fileAttach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the bid-evaluation report list of www.gdzbtb.gov.cn, builds a <c>BidProject</c> for each list
/// entry from its title and detail page, saves it, and stores the report file linked from the detail
/// page's script section as an attachment.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been submitted for saving.
/// </param>
/// <returns>The (never populated) result list; records are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string html = string.Empty;
    int sqlCount = 0;
    int pageInt = 1;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The page count is the value between "(" and "," in the text of td.cn6.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "cn6")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString().Replace("(", "kdxx").GetRegexBegEnd("kdxx", ",");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Unparseable pager: keep the single-page default.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Page N (N > 1) lives at pingbiaobaogao_{N-1}.htm.
                html = this.ToolWebSite.GetHtmlByUrl("http://www.gdzbtb.gov.cn/pbbgbd/pingbiaobaogao_" + (i - 1).ToString() + ".htm", Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "position2")), true), new TagNameFilter("li")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            // Fields the list/detail pages do not provide stay empty.
            string bPrjno = string.Empty, bBidresultendtime = string.Empty, bBaseprice = string.Empty,
                   bBuildunit = string.Empty, bBidmethod = string.Empty, bRemark = string.Empty;
            string bCity;

            // The title has the form "[region]-name": the bracketed part selects the city,
            // the rest becomes the project name.
            string bPrjname = nodeList[j].GetATagValue("title");
            if (bPrjname.Contains("广东省"))
            {
                bCity = "广州市区";
                bPrjname = bPrjname.Replace("[", "").Replace("]-", "").Replace("]", "").Replace("广东省", "");
            }
            else
            {
                string temp = bPrjname.Replace("[", "kdxx").Replace("]", "xxdk").GetRegexBegEnd("kdxx", "xxdk");
                bPrjname = bPrjname.Replace("[", "").Replace("]-", "").Replace("]", "");
                // string.Replace throws when the search string is empty, so only strip a region that was found.
                if (!string.IsNullOrEmpty(temp))
                {
                    bPrjname = bPrjname.Replace(temp, "");
                }
                bCity = temp + "区";
            }

            string bInfourl = "http://www.gdzbtb.gov.cn/pbbgbd/" + nodeList[j].GetATagHref().Replace("../", "").Replace("./", "");
            string htldtl = string.Empty;
            try
            {
                htldtl = this.ToolWebSite.GetHtmlByUrl(bInfourl, Encoding.Default);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("cellSpacing", "1")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string htmlTxt = dtlNode.AsHtml();
            string bBiddate = htmlTxt.GetDateRegex();
            if (string.IsNullOrEmpty(bBiddate))
            {
                bBiddate = DateTime.Now.ToString("yyyy-MM-dd");
            }

            // The report link is embedded in the script that fills #pbbg_shongti. Cut from that marker up to
            // the first closing anchor AFTER it; searching from the start of the document could hit an earlier
            // "</a>" and produce a negative Substring length.
            string attachUrl = string.Empty;
            string attachName = string.Empty;
            int len1 = htldtl.IndexOf("$(\"#pbbg_shongti\")");
            int len2 = len1 > 0 ? htldtl.IndexOf("</a>", len1) : -1;
            if (len1 > 0 && len2 > len1)
            {
                string aurl = htldtl.Substring(len1, len2 - len1) + "</a>";
                parser = new Parser(new Lexer(aurl));
                NodeList atagNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (atagNode != null && atagNode.Count > 0)
                {
                    ATag aTag = atagNode.GetATag();
                    if (aTag != null)
                    {
                        attachUrl = aTag.Link;
                        attachName = aTag.LinkText;
                    }
                }
            }
            if (string.IsNullOrEmpty(attachName))
            {
                attachName = bPrjname;
            }

            BidProject info = ToolDb.GenResultProject("广东省", bCity, "", bPrjno, bPrjname, bBidresultendtime, bBaseprice, bBiddate, bBuildunit, bBidmethod, bRemark, bInfourl);
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return list;
            }
            sqlCount++;

            // The URL comes from crawled HTML, so escape quotes before embedding it in the SQL literal.
            string sql = string.Format("select Id from BidProject where 1=1 and InfoUrl='{0}'", (info.InfoUrl ?? string.Empty).Replace("'", "''"));
            string result = Convert.ToString(ToolDb.ExecuteScalar(sql));
            bool alreadyStored = !string.IsNullOrEmpty(result);

            // Known records go through the update-aware overload, new ones through the plain insert.
            bool saved = alreadyStored
                ? ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate)
                : ToolDb.SaveEntity(info, this.ExistCompareFields);
            if (!saved || string.IsNullOrEmpty(attachUrl))
            {
                continue;
            }

            // Report files are stored under a yyyyMM folder derived from the bid date.
            DateTime bidDate;
            string fileUrl = (DateTime.TryParse(bBiddate, out bidDate) ? bidDate : DateTime.Now).ToString("yyyyMM");
            string alink = "http://www.gdzbtb.gov.cn/pbbgbd/" + fileUrl + "/" + attachUrl.Replace("../", "").Replace("./", "");

            BaseAttach attach = null;
            try
            {
                attach = ToolHtml.GetBaseAttach(alink, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                if (attach == null)
                {
                    attach = ToolHtml.GetBaseAttachByUrl(alink, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                }
            }
            catch
            {
                // Best effort: a report file that cannot be fetched leaves the record without attachment.
            }

            if (attach != null)
            {
                if (alreadyStored)
                {
                    // Replace the attachments recorded for the previously stored project.
                    string sqlDelete = string.Format("delete from BaseAttach where SourceId='{0}'", result.Replace("'", "''"));
                    ToolDb.ExecuteSql(sqlDelete);
                }
                ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged announcement list of www.hnsztb.com.cn, loads each entry's detail page and stores it
/// as a <c>NotifyInfo</c> of type "通知公告", then saves the attachment links found in the detail content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns once the running record counter reaches <c>MaxCount</c>.
/// </param>
/// <returns>The (never populated) result list; records are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<NotifyInfo>();
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.Default);
    }
    catch
    {
        return list;
    }

    // The page count is the value between "/" and "页" in the text of table.style1.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "style1")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode[0].ToNodePlainString().GetRegexBegEnd("/", "页");
            pageInt = int.Parse(temp);
        }
        catch
        {
            // Unparseable pager: keep the single-page default.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + "?page=" + i, Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        // Each list entry is a width=100% table nested inside the width=99% table.
        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "99%")), true), new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%"))));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            TableTag entryTable = listNode[j] as TableTag;
            if (entryTable == null || entryTable.RowCount == 0)
            {
                continue;
            }
            TableRow tr = entryTable.Rows[0];
            ATag aTag = tr.GetATag();
            if (aTag == null)
            {
                // Entry without a link: nothing to fetch.
                continue;
            }

            string infoScorce = string.Empty;
            string headName = aTag.LinkText;
            // Titles above 200 bytes are cut to 99 characters; Math.Min keeps the cut in range when
            // the default encoding needs more than two bytes per character.
            if (Encoding.Default.GetByteCount(headName) > 200)
            {
                headName = headName.Substring(0, Math.Min(99, headName.Length));
            }
            string infoUrl = "http://www.hnsztb.com.cn/gsgg/" + aTag.Link;

            string htmldtl = string.Empty;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "800")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }
            TableTag table = dtlNode[0] as TableTag;
            if (table == null)
            {
                continue;
            }

            // Use the second row as content when there is one, otherwise the whole table.
            string ctxHtml = table.RowCount > 1 ? table.Rows[1].ToHtml() : table.ToHtml();
            string infoCtx = ctxHtml.ToCtxString();

            // Try the supported date notations in turn until one yields a release date.
            string releaseTime = infoCtx.GetDateRegex();
            if (string.IsNullOrEmpty(releaseTime))
            {
                releaseTime = infoCtx.GetDateRegex("yyyy年MM月dd日");
            }
            if (string.IsNullOrEmpty(releaseTime))
            {
                releaseTime = infoCtx.GetDateRegex("yyyy/MM/dd");
            }
            if (string.IsNullOrEmpty(releaseTime))
            {
                releaseTime = infoCtx.GetChinaTime();
            }

            string msgType = "河南省建设工程招标投标协会";
            string infoType = "通知公告";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "河南省", "河南省及地市", string.Empty, infoCtx, infoType);

            sqlCount++;
            // The record limit applies to incremental crawls only (crawlAll == false); the condition was
            // previously inverted. Return the same list object as every other exit of this method.
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return list;
            }

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null || aNode.Count == 0)
            {
                continue;
            }

            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k].GetATag();
                if (a == null || !a.IsAtagAttach())
                {
                    continue;
                }

                // Absolute links are used as-is. This must be the attachment's own link (a), not the
                // list entry's link (aTag).
                string link = a.Link.ToLower().Contains("http")
                    ? a.Link
                    : "http://www.hnsztb.com.cn/" + a.Link;

                try
                {
                    BaseAttach entity = ToolHtml.GetBaseAttach(link, a.LinkText, info.Id);
                    if (entity == null)
                    {
                        entity = ToolHtml.GetBaseAttachByUrl(link, a.LinkText, info.Id);
                    }
                    if (entity != null)
                    {
                        ToolDb.SaveEntity(entity, string.Empty);
                    }
                }
                catch
                {
                    // Best effort: skip attachments that cannot be downloaded or stored.
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the paged notice list of www.zzjs.com.cn, loads each entry's detail page and stores it as a
/// <c>NotifyInfo</c> of type "通知公告"; links in the content whose URL contains "download" are saved
/// as attachments. Entries whose detail page has no matching content node are logged with "无内容".
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns once the running record counter reaches <c>MaxCount</c>.
/// </param>
/// <returns>Always <c>null</c>; records are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int totalPages = 1;
    int processed = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl);
    }
    catch
    {
        return null;
    }

    // The page count sits between "/共" and "页" in the text of the last ul.pagination.
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagers = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "pagination")));
    if (pagers != null && pagers.Count > 0)
    {
        try
        {
            totalPages = int.Parse(pagers[pagers.Count - 1].ToNodePlainString().GetRegexBegEnd("/共", "页"));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            string[] fieldNames = new string[] { "pageindex", "X-Requested-With" };
            string[] fieldValues = new string[] { page.ToString(), "XMLHttpRequest" };
            NameValueCollection postData = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
            try
            {
                listHtml = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, postData);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList entries = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "left_picinfo_text")), true), new TagNameFilter("li")));
        if (entries == null || entries.Count <= 0)
        {
            continue;
        }

        for (int entryIndex = 0; entryIndex < entries.Count; entryIndex++)
        {
            INode entry = entries[entryIndex];
            ATag titleLink = entry.GetATag();
            string title = titleLink.LinkText;
            string published = entry.ToPlainTextString().GetDateRegex("yyyy年MM月dd日");
            string detailUrl = "http://www.zzjs.com.cn" + titleLink.Link;

            string detailHtml = string.Empty;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(detailUrl).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "clear")));
            if (contentNodes == null || contentNodes.Count <= 0)
            {
                Logger.Error("无内容");
                continue;
            }

            string contentHtml = contentNodes[0].ToHtml();
            string contentText = contentHtml.ToCtxString();
            NotifyInfo notice = ToolDb.GenNotifyInfo(title, published, string.Empty, "郑州市城乡建设委员会", detailUrl, contentHtml, "河南省", "河南省及地市", "郑州市", contentText, "通知公告");

            // The counter is advanced before the limit check.
            processed++;
            if (!crawlAll && processed >= this.MaxCount)
            {
                return null;
            }

            if (!ToolDb.SaveEntity(notice, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            parser = new Parser(new Lexer(contentHtml));
            NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (anchors == null || anchors.Count <= 0)
            {
                continue;
            }

            for (int n = 0; n < anchors.Count; n++)
            {
                ATag anchor = anchors[n] as ATag;
                if (!anchor.Link.ToLower().Contains("download"))
                {
                    continue;
                }

                string fileLink = anchor.Link.ToLower().Contains("http")
                    ? anchor.Link
                    : "http://www.zzjs.com.cn" + anchor.Link.GetReplace("../,./");

                // Links longer than 500 bytes are not stored.
                if (Encoding.Default.GetByteCount(fileLink) > 500)
                {
                    continue;
                }

                try
                {
                    BaseAttach fileAttach = ToolHtml.GetBaseAttachByUrl(fileLink, anchor.LinkText, notice.Id);
                    if (fileAttach != null)
                    {
                        ToolDb.SaveEntity(fileAttach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                }
            }
        }
    }
    return null;
}
/// <summary>
/// Crawls the paged penalty table (table#bean) at <c>SiteUrl</c>, stores every data row as a
/// <c>CorpWarning</c> and attaches the decision document downloaded from the URL built out of the
/// row's onclick value. After every 199 processed rows the thread sleeps for 480000 ms.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl returns once the running record counter reaches <c>MaxCount</c>.
/// </param>
/// <returns>The (never populated) result list; records are persisted through <c>ToolDb</c>.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // NOTE(review): both counters start at 1, not 0 - confirm this is intended.
    int rowsSincePause = 1;
    int processed = 1;
    int totalPages = 1;
    IList list = new List<CorpWarning>();
    string listHtml = string.Empty;

    try
    {
        listHtml = ToolWeb.GetHtmlByUrl(SiteUrl, Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    // The href of a#lx carries the page count between "page=" and "&".
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList lastPageLink = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "lx")));
    if (lastPageLink != null && lastPageLink.Count > 0)
    {
        try
        {
            totalPages = int.Parse(lastPageLink.GetATagHref().GetRegexBegEnd("page=", "&"));
        }
        catch
        {
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                listHtml = ToolWeb.GetHtmlByUrl(this.SiteUrl + "&page=" + page.ToString(), Encoding.Default);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList tables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "bean")));
        if (tables == null || tables.Count <= 0)
        {
            continue;
        }

        TableTag grid = tables[0] as TableTag;
        // Row 0 is skipped; data rows start at index 1.
        for (int rowIndex = 1; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];

            // NOTE(review): the anchor is looked up on the cell's plain text rather than on the cell
            // node itself - confirm this is what GetATag expects.
            string code = row.Columns[1].ToPlainTextString().GetATag().LinkText;
            string warningName = row.Columns[2].ToNodePlainString();
            string deliveryDate = row.Columns[3].ToPlainTextString().GetDateRegex();
            string warningType = row.Columns[4].ToNodePlainString();
            string punishmentType = row.Columns[5].ToNodePlainString();

            // The document name is the first single-quoted value inside the cell's onclick attribute.
            string documentName = row.Columns[1].GetATagValue("onclick").Replace("'", "lxl").GetRegexBegEnd("lxl", "lxl");
            string documentUrl = "http://61.144.226.2:8001/web/cxda/xzcfAction.do?method=downLoadXzcfjdRemote&xzcfjdname=" + documentName;

            // Fields the table does not provide are passed as empty strings.
            CorpWarning warning = ToolDb.GenCorpWarning("广东省", "深圳市区", "", code, warningName, deliveryDate, warningType, punishmentType, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, string.Empty, "深圳市住房和建设局", string.Empty);

            processed++;
            if (!crawlAll && processed >= this.MaxCount)
            {
                return list;
            }

            if (ToolDb.SaveEntity(warning, this.ExistCompareFields, this.ExistsUpdate))
            {
                BaseAttach document = null;
                try
                {
                    document = ToolHtml.GetBaseAttachByUrl(documentUrl, code, warning.Id, "SiteManage\\Files\\Attach\\");
                }
                catch
                {
                }

                if (document != null)
                {
                    ToolDb.SaveEntity(document, "");
                }
            }

            // Throttle: pause once the row counter reaches 200, then start counting again.
            rowsSincePause++;
            if (rowsSincePause >= 200)
            {
                rowsSincePause = 1;
                Thread.Sleep(480000);
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls bid-result announcements of the Shenzhen construction trading centre (Longgang branch).
/// The list is fetched as JSON from <c>SiteUrl + MaxCount</c>. For each entry the legacy HTML detail page
/// is tried first; when it is empty, the JSON detail service is queried instead and its values are merged
/// into the placeholder spans of the page at <c>detailUrl</c>. Every entry is stored as a <c>BidInfo</c>,
/// followed by its attachments.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, the crawl stops as soon as <c>MaxCount</c> records have been submitted for saving.
/// </param>
/// <returns>
/// <c>null</c> when the list cannot be fetched or parsed or when the record limit is hit; otherwise the
/// (never populated) result list. Records are persisted through <c>ToolDb</c>.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int sqlCount = 0;
    IList list = new List<BidInfo>();
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    // Keep only the outermost {...} of the response. An empty or brace-less response is treated like a
    // failed fetch instead of letting Substring throw.
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }
    int startIndex = html.IndexOf("{");
    int endIndex = html.LastIndexOf("}");
    if (startIndex < 0 || endIndex < startIndex)
    {
        return null;
    }
    html = html.Substring(startIndex, (endIndex + 1) - startIndex);

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson = serializer.DeserializeObject(html) as Dictionary<string, object>;
    if (smsTypeJson == null)
    {
        return null;
    }

    foreach (KeyValuePair<string, object> obj in smsTypeJson)
    {
        if (obj.Key == "total")
        {
            continue;
        }
        // Only array-valued members carry announcement rows; skip anything else instead of failing the cast.
        object[] array = obj.Value as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object arrValue in array)
        {
            Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
            if (dic == null)
            {
                continue;
            }

            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty,
                   HtmlTxt = string.Empty;
            string code = Convert.ToString(dic["bdBH"]);
            string prjName = Convert.ToString(dic["bdName"]);
            string beginDate = Convert.ToString(dic["fabuTime2"]);
            string saveUrl = Convert.ToString(dic["detailUrl"]);
            string InfoUrl = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/queryOldOTDataDetail.do?type=4&id=" + dic["bdBH"];
            List<Dictionary<string, object>> listAttachs = new List<Dictionary<string, object>>();
            bool isJson = false;

            try
            {
                HtmlTxt = this.ToolWebSite.GetHtmlByUrl(InfoUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
                if (string.IsNullOrEmpty(HtmlTxt))
                {
                    // No legacy page: read the details from the JSON service.
                    isJson = true;
                    string url = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/queryZbgs.do?guid=" + dic["dbZhongBiaoJieGuoGuid"] + "&ggGuid=&bdGuid=";
                    string htmldtl = this.ToolWebSite.GetHtmlByUrl(url);
                    Dictionary<string, object> dtlJsons = (Dictionary<string, object>)serializer.DeserializeObject(htmldtl);
                    buildUnit = Convert.ToString(dtlJsons["zbrAndLht"]);
                    bidUnit = Convert.ToString(dtlJsons["tbrName"]);
                    bidMoney = Convert.ToString(dtlJsons["zhongBiaoJE"]);
                    // The amount is scaled by 1/1,000,000 and labelled "万元" further down; a value that
                    // does not parse is kept as received.
                    decimal rawAmount;
                    if (decimal.TryParse(bidMoney, out rawAmount))
                    {
                        bidMoney = (rawAmount / 1000000).ToString();
                    }
                    prjMgr = Convert.ToString(dtlJsons["xiangMuJiLi"]);

                    // Optional nested objects; each stays null when absent.
                    Dictionary<string, object> bd = null;
                    Dictionary<string, object> gc = null;
                    Dictionary<string, object> xm = null;
                    try { bd = dtlJsons["bd"] as Dictionary<string, object>; } catch { }
                    try { gc = bd["gc"] as Dictionary<string, object>; } catch { }
                    try { xm = bd["xm"] as Dictionary<string, object>; } catch { }

                    string htl = this.ToolWebSite.GetHtmlByUrl(saveUrl);
                    Parser parser = new Parser(new Lexer(htl));
                    NodeList nodelist = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "right_bg")));
                    if (nodelist != null && nodelist.Count > 0)
                    {
                        HtmlTxt = nodelist.AsHtml();
                        // Fill the empty placeholder spans of the template. Each fill is independent and
                        // best effort: a missing key or missing nested object leaves that span untouched.
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"gcBH\"></span>", "<span id=\"gcBH\">" + code + "</span>"); } catch { }
                        // The replacement keeps the placeholder's own id (it used to be written as "gcBH").
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"gcName\"></span>", "<span id=\"gcName\">" + gc["gcName"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"bdName\"></span>", "<span id=\"bdName\">" + prjName + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"xmBH\"></span>", "<span id=\"xmBH\">" + xm["xm_BH"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"xmName\"></span>", "<span id=\"xmName\">" + xm["xm_Name"] + "</span>"); } catch { }
                        try
                        {
                            long zbgsStartTime = Convert.ToInt64(dtlJsons["zbgsStartTime"]);
                            HtmlTxt = HtmlTxt.GetReplace("<span id=\"zbgsStartTime\"></span>", "<span id=\"zbgsStartTime\">" + ToolHtml.GetDateTimeByLong(zbgsStartTime) + "</span>");
                        }
                        catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"zbRName\"></span>", "<span id=\"zbRName\">" + gc["zbRName"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"zbdlJG\"></span>", "<span id=\"zbdlJG\">" + gc["creatorName"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"tbrName\"></span>", "<span id=\"tbrName\">" + dtlJsons["tbrName"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"zhongBiaoJE\"></span>", "<span id=\"zhongBiaoJE\">" + bidMoney + "万元</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"zhongBiaoGQ\"></span>", "<span id=\"zhongBiaoGQ\">" + dtlJsons["zhongBiaoGQ"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"xiangMuJiLi\"></span>", "<span id=\"xiangMuJiLi\">" + prjMgr + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"ziGeDengJi\"></span>", "<span id=\"ziGeDengJi\">" + dtlJsons["ziGeDengJi"] + "</span>"); } catch { }
                        try { HtmlTxt = HtmlTxt.GetReplace("<span id=\"ziGeZhengShu\"></span>", "<span id=\"ziGeZhengShu\">" + dtlJsons["ziGeZhengShu"] + "</span>"); } catch { }
                        try
                        {
                            // NOTE(review): any non-blank value (including e.g. "false") yields "是" - confirm.
                            string zanding = string.IsNullOrWhiteSpace(Convert.ToString(dtlJsons["isZanDingJinE"])) ? "否" : "是";
                            HtmlTxt = HtmlTxt.GetReplace("<span id=\"isZanDingJinE\"></span>", "<span id=\"isZanDingJinE\">" + zanding + "</span>");
                        }
                        catch { }
                    }

                    // Collect the attachment descriptors of the announcement's file group (best effort).
                    try
                    {
                        string fileUrl = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/filegroup/queryByGroupGuidZS.do?groupGuid=" + dtlJsons["ztbFileGroupGuid"];
                        string fileJson = this.ToolWebSite.GetHtmlByUrl(fileUrl);
                        Dictionary<string, object> fileDic = (Dictionary<string, object>)serializer.DeserializeObject(fileJson);
                        object[] objFile = fileDic["rows"] as object[];
                        foreach (object file in objFile)
                        {
                            Dictionary<string, object> attach = file as Dictionary<string, object>;
                            listAttachs.Add(attach);
                        }
                    }
                    catch { }
                }
            }
            catch
            {
                // Detail data could not be fetched or parsed: skip this announcement.
                continue;
            }

            string bidCtx = HtmlTxt.Replace("<br />", "\r\n").ToCtxString();
            if (!isJson)
            {
                // Legacy page: the structured fields are extracted from the plain text.
                buildUnit = bidCtx.GetBuildRegex();
                bidUnit = bidCtx.GetBidRegex();
                bidMoney = bidCtx.GetMoneyRegex();
                prjMgr = bidCtx.GetMgrRegex();
                if (string.IsNullOrEmpty(prjMgr))
                {
                    prjMgr = bidCtx.GetRegex("项目负责");
                }
            }

            string msgType = "深圳市建设工程交易中心龙岗分中心";
            string specType = "建设工程";
            string bidType = ToolHtml.GetInviteTypes(prjName);
            prjName = ToolDb.GetPrjName(prjName);
            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳龙岗区工程", "龙岗区", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, saveUrl, prjMgr, HtmlTxt);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            if (!isJson)
            {
                // Legacy page: attachments are the download links inside the stored HTML.
                Parser parser = new Parser(new Lexer(HtmlTxt));
                NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (fileNode != null && fileNode.Count > 0)
                {
                    for (int f = 0; f < fileNode.Count; f++)
                    {
                        ATag tag = fileNode[f] as ATag;
                        if (tag == null)
                        {
                            continue;
                        }
                        if (tag.IsAtagAttach() || tag.Link.ToLower().Contains("downloadfile"))
                        {
                            try
                            {
                                string link;
                                if (tag.Link.ToLower().Contains("http"))
                                {
                                    // Absolute link: drop a trailing "//" and all backslashes.
                                    link = tag.Link;
                                    if (link.EndsWith("//"))
                                    {
                                        link = link.Remove(link.LastIndexOf("//"));
                                    }
                                    link = link.GetReplace("\\", "");
                                }
                                else
                                {
                                    link = "https://www.szjsjy.com.cn:8001/" + tag.Link;
                                }
                                BaseAttach attach = ToolHtml.GetBaseAttachByUrl(link, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                                if (attach != null)
                                {
                                    ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                                }
                            }
                            catch
                            {
                                continue;
                            }
                        }
                    }
                }
            }
            else
            {
                // JSON details: attachments come from the file-group descriptors collected above.
                foreach (Dictionary<string, object> attach in listAttachs)
                {
                    try
                    {
                        string attachName = Convert.ToString(attach["attachName"]);
                        string attachId = Convert.ToString(attach["attachGuid"]);
                        string link = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachId;
                        BaseAttach attachBase = ToolHtml.GetBaseAttach(link, attachName, info.Id, "SiteManage\\Files\\InviteAttach\\");
                        if (attachBase != null)
                        {
                            ToolDb.SaveEntity(attachBase, "SourceID,AttachServerPath");
                        }
                    }
                    catch
                    {
                        // Best effort: skip descriptors that are incomplete or cannot be downloaded.
                    }
                }
            }
        }
    }
    return list;
}
/// <summary>
/// Crawls the notice list of www.lnzb.cn page by page, downloads the detail page of
/// every listed notice and hands the notice, plus the attachments linked from its
/// content, to <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every list page; <c>false</c> to stop as soon as
/// <c>MaxCount</c> notices have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>Always <c>null</c>; nothing is collected in a result list.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string siteRoot = "http://www.lnzb.cn";

    string html = string.Empty;
    string cookiestr = string.Empty;
    int pageInt = 1, sqlCount = 0;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8, ref cookiestr);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return null;
    }

    // The pager block carries the page total between "总页数:" and "当前".
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "MoreInfoList1_Pager")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            pageInt = int.Parse(pageNode.AsString().GetRegexBegEnd("总页数:", "当前"));
        }
        catch
        {
            // Page total not recognised: keep the default and crawl the first page only.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting back the hidden form fields read from
            // the page currently held in html, with the target page as event argument.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__VIEWSTATE", "__EVENTTARGET", "__EVENTARGUMENT", "__EVENTVALIDATION" },
                new string[] { viewState, "MoreInfoList1$Pager", i.ToString(), eventValidation });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "MoreInfoList1_DataGrid1")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        TableTag table = listNode[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        for (int j = 0; j < table.RowCount; j++)
        {
            // A row that lacks the title/date cells or the title link is skipped instead of
            // throwing and aborting the whole crawl.
            TableRow tr = table.Rows[j];
            if (tr == null || tr.Columns == null || tr.Columns.Length < 3)
            {
                continue;
            }

            ATag aTag = tr.Columns[1].GetATag();
            if (aTag == null)
            {
                continue;
            }

            string infoType = "通知公告";
            string infoSource = string.Empty;
            string headName = aTag.GetAttribute("title");
            string releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex();
            string infoUrl = siteRoot + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "tblInfo")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = "辽宁省建设厅招标投标管理处";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "辽宁省", "辽宁省及地市", "", infoCtx, infoType);

            // Test the limit BEFORE counting this notice. Counting first stopped the crawl
            // one notice early (and saved nothing at all when MaxCount was 1).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            // Attachments are the links inside the notice content.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null)
            {
                continue;
            }

            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a == null)
                {
                    continue;
                }
                if (!a.Link.ToLower().Contains("readattachfile") && !a.IsAtagAttach())
                {
                    continue;
                }

                string link = a.Link.ToLower().Contains("http")
                    ? a.Link
                    : siteRoot + a.Link.GetReplace("../,./");

                // Links longer than 500 bytes are ignored.
                // NOTE(review): presumably a storage column limit - confirm.
                if (Encoding.Default.GetByteCount(link) > 500)
                {
                    continue;
                }

                try
                {
                    BaseAttach attach = ToolHtml.GetBaseAttachByUrl(link, a.LinkText, info.Id);
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // Best effort: one failing attachment must not stop the notice crawl.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the tender-invitation grid (<c>ctl00_cph_context_InfoList2_GridView1</c>) page by
/// page, downloads each invitation's detail page and hands the invitation, plus the
/// attachments linked from the detail page, to <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every list page; <c>false</c> to stop once <c>MaxCount</c>
/// invitations have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>
/// The (never populated) result list, or <c>null</c> when the crawl stops because the
/// <c>MaxCount</c> limit was reached.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    const string contentSpanId = "ctl00_cph_context_lblContent";

    IList list = new List<InviteInfo>();
    int sqlCount = 0;
    // Number of list pages
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList sNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new HasAttributeFilter("id", "ctl00_cph_context_InfoList2_GridViewPaging1_PagingDescTd"),
            new TagNameFilter("td")));
    try
    {
        // The paging description contains ",共N页"; the whole lookup sits inside the try so a
        // missing pager cell falls back to a single page instead of throwing.
        Match pageMatch = Regex.Match(sNode.AsString(), @",共[^页]+页");
        pageInt = int.Parse(pageMatch.Value.Replace(",共", "").Replace("页", "").Trim());
    }
    catch (Exception)
    {
        // Page total not recognised: crawl the first page only.
    }

    // Loop-invariant patterns, built once instead of once per row.
    Regex regPrjAdd = new Regex(@"(工程地点|工程地址):[^\r\n]+[\r\n]{1}");
    Regex regOtherType = new Regex(@"(工程类型):[^\r\n]+[\r\n]{1}");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Jump to page i by posting the "forward to page" form with the current view state.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[]
                {
                    "ctl00$ScriptManager1",
                    "__EVENTTARGET",
                    "__EVENTARGUMENT",
                    "__LASTFOCUS",
                    "__VIEWSTATE",
                    "ctl00$cph_context$InfoList2$ddlProjectType",
                    "ctl00$cph_context$InfoList2$ddlSearch",
                    "ctl00$cph_context$InfoList2$txtProjectName",
                    "ctl00$cph_context$InfoList2$GridViewPaging1$txtGridViewPagingForwardTo",
                    "__VIEWSTATEENCRYPTED",
                    "ctl00$cph_context$InfoList2$GridViewPaging1$btnForwardToPage"
                },
                new string[]
                {
                    "ctl00$cph_context$InfoList2$update1|ctl00$cph_context$InfoList2$GridViewPaging1$btnForwardToPage",
                    string.Empty,
                    string.Empty,
                    string.Empty,
                    viewState,
                    string.Empty,
                    "gcbh",
                    string.Empty,
                    i.ToString(),
                    "",
                    "GO"
                });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(
            new AndFilter(
                new HasAttributeFilter("id", "ctl00_cph_context_InfoList2_GridView1"),
                new TagNameFilter("table")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // Row 0 is skipped (the loop has always started at 1).
        for (int j = 1; j < table.RowCount; j++)
        {
            // Rows that do not have the seven expected cells or the project link (for example
            // a pager row) are skipped instead of throwing and aborting the whole crawl.
            TableRow tr = table.Rows[j] as TableRow;
            if (tr == null || tr.Columns == null || tr.Columns.Length < 7)
            {
                continue;
            }

            NodeList rowAnchors = tr.Columns[2].SearchFor(typeof(ATag), true);
            ATag aTag = (rowAnchors != null && rowAnchors.Count > 0) ? rowAnchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string remark = string.Empty, otherType = string.Empty, htmlTxt = string.Empty;
            string code = tr.Columns[1].ToPlainTextString().Trim();
            string prjName = tr.Columns[2].ToPlainTextString().Trim();
            string buildUnit = tr.Columns[3].ToPlainTextString().Trim();
            string invType = tr.Columns[4].ToPlainTextString().Trim();
            string beginDate = tr.Columns[5].ToPlainTextString().Trim();
            string endDate = tr.Columns[6].ToPlainTextString().Trim();

            // The href is a javascript call; strip the wrapper to get the relative page path.
            string infoUrl = "http://61.144.224.189:8001/LGjyzxWeb/SiteManage/"
                + aTag.Link.Replace("openNewWindowByMenu(\"", "").Replace("\")", "");

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(htmldetail));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(
                    new AndFilter(new HasAttributeFilter("id", contentSpanId), new TagNameFilter("span")));
                htmlTxt = dtnodeHTML.AsHtml();
                // The plain-text variant is parsed from a copy whose line-break tags are real line breaks.
                htmldetail = htmldetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(
                new AndFilter(new HasAttributeFilter("id", contentSpanId), new TagNameFilter("span")));
            string inviteCtx = dtnode.AsString().Replace("\r\r\n", "\r\n");

            string prjAddress = regPrjAdd.Match(inviteCtx).Value.Replace("工程地点:", "").Replace("工程地址:", "").Trim();
            string msgType = "深圳市建设工程交易中心龙岗分中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(invType);

            // NOTE(review): otherType is derived here but GenInviteInfo below receives
            // string.Empty in the slot after specType - confirm whether it should be passed.
            string oType = regOtherType.Match(inviteCtx).Value.Replace("工程类型:", "").Trim();
            if (oType.Contains("房建")) { otherType = "房建及工业民用建筑"; }
            if (oType.Contains("市政")) { otherType = "市政工程"; }
            if (oType.Contains("园林绿化")) { otherType = "园林绿化工程"; }
            if (oType.Contains("装饰装修")) { otherType = "装饰装修工程"; }
            if (oType.Contains("电力")) { otherType = "电力工程"; }
            if (oType.Contains("水利")) { otherType = "水利工程"; }
            if (oType.Contains("环保")) { otherType = "环保工程"; }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳龙岗区工程", "龙岗区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, infoUrl, htmlTxt);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            // Re-scan the whole detail page for attachment links.
            dtlparser.Reset();
            NodeList fileNode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode == null)
            {
                continue;
            }

            // NOTE(review): the scan starts at the second link (index 1) as it always has;
            // confirm the first link of the page is never an attachment.
            for (int f = 1; f < fileNode.Count; f++)
            {
                ATag tag = fileNode[f] as ATag;
                if (tag == null || !tag.IsAtagAttach())
                {
                    continue;
                }

                try
                {
                    string url = "http://61.144.224.189:8001/LGjyzxWeb/" + tag.Link.Replace("../", "");
                    BaseAttach attach = ToolHtml.GetBaseAttachByUrl(url, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // Best effort: skip an attachment that cannot be fetched or stored.
                    continue;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the procurement announcement table (<c>topicChrList_20070702_table</c>) of
/// www.szzfcg.cn page by page, downloads each announcement's detail page and hands the
/// announcement, plus the attachments linked from the detail page, to
/// <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every list page; <c>false</c> to stop once <c>MaxCount</c>
/// announcements have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>
/// The (never populated) result list, or <c>null</c> when the crawl stops because the
/// <c>MaxCount</c> limit was reached.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int sqlCount = 0;
    int count = 0;
    // Number of list pages
    int pageInt = 1;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // The page selector holds one option per list page.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));
    if (pageNode != null && pageNode.Count > 0)
    {
        SelectTag selectTag = pageNode[0] as SelectTag;
        if (selectTag != null && selectTag.OptionTags != null)
        {
            pageInt = selectTag.OptionTags.Length;
        }
    }

    // Loop-invariant patterns, built once instead of once per row.
    Regex regexLink = new Regex(@"id=[^-]+");
    Regex regCtx = new Regex(@"[\r\n]+");
    Regex regcode = new Regex(@"(招标编号|项目编号)(:|:)([0-9]|[A-Za-z]|[-])+");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    string cookiestr = string.Empty;
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[]
                    {
                        "ec_i",
                        "topicChrList_20070702_crd",
                        "topicChrList_20070702_f_a",
                        "topicChrList_20070702_p",
                        "topicChrList_20070702_s_name",
                        "id",
                        "method",
                        "__ec_pages",
                        "topicChrList_20070702_rd",
                        "topicChrList_20070702_f_name",
                        "topicChrList_20070702_f_ldate"
                    },
                    new string[]
                    {
                        "topicChrList_20070702",
                        "20",
                        string.Empty,
                        i.ToString(),
                        string.Empty,
                        "1660",
                        "view",
                        (i - 1).ToString(),
                        "20",
                        string.Empty,
                        string.Empty
                    });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                // Skip this page. Swallowing the error left the previous page in html, so
                // its rows were processed a second time.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList tdNodes = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));
        if (tdNodes == null || tdNodes.Count == 0)
        {
            continue;
        }

        TableTag table = tdNodes[0] as TableTag;
        if (table == null)
        {
            continue;
        }

        // The first three rows are skipped (the loop has always started at 3).
        for (int t = 3; t < table.RowCount; t++)
        {
            // Rows without the expected cells or the title link are skipped instead of
            // throwing and aborting the whole crawl.
            TableRow tr = table.Rows[t];
            if (tr == null || tr.Columns == null || tr.Columns.Length < 5)
            {
                continue;
            }

            NodeList rowAnchors = tr.Columns[2].SearchFor(typeof(ATag), true);
            ATag aTag = (rowAnchors != null && rowAnchors.Count > 0) ? rowAnchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }

            string buildUnit = string.Empty, endDate = string.Empty, remark = string.Empty, htmlTxt = string.Empty;
            string prjName = tr.Columns[2].ToPlainTextString().Trim().ToRegString();
            string inviteType = tr.Columns[3].ToPlainTextString().Trim();
            string beginDate = tr.Columns[4].ToPlainTextString().Trim();
            string infoUrl = "http://www.szzfcg.cn/portal/documentView.do?method=view&" + regexLink.Match(aTag.Link).Value;

            string htmldetail = string.Empty;
            try
            {
                // One download feeds both variants; the page used to be requested twice.
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);

                htmldetail = rawDetail.Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(htmldetail));
                htmlTxt = dtlparserHTML.ExtractAllNodesThatMatch(new TagNameFilter("body")).AsHtml();

                htmldetail = rawDetail.Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n");
            }
            catch (Exception)
            {
                // NOTE(review): as before, a failed download does not skip the row; the record
                // is still stored with whatever content the fallbacks below recover. Confirm
                // that storing a content-less record is intended.
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            // The self-replacements of the curly quotes were no-ops and are dropped; HtmlDecode
            // below resolves any remaining entities.
            string inviteCtx = dtnode.AsString()
                .Replace(" ", "")
                .Replace("\t", "")
                .Trim('\r', '\n')
                .Replace("双击鼠标自动滚屏[打印此页][关闭此页]", "");
            inviteCtx = System.Web.HttpUtility.HtmlDecode(inviteCtx);
            inviteCtx = regCtx.Replace(inviteCtx, "\r\n");

            string code = regcode.Match(inviteCtx).Value
                .Replace("招标编号", "")
                .Replace("项目编号", "")
                .Replace(":", "")
                .Replace(":", "")
                .Trim();

            // Fallback 1: take the body as-is when the cleaned text or markup came out empty.
            if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(htmlTxt))
            {
                parser = new Parser(new Lexer(htmldetail));
                NodeList ctxList = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                inviteCtx = ctxList.AsString();
                htmlTxt = ctxList.AsHtml();
            }

            // Fallback 2: strip script/style/xml blocks and then all tags with regular expressions.
            if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(htmlTxt))
            {
                htmlTxt = regexHtml.Replace(htmldetail, "");
                inviteCtx = Regex.Replace(htmlTxt, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            }

            string msgType = "深圳政府采购";
            string specType = "政府采购";
            string prjAddress = "深圳市";
            if (inviteType.Contains("160"))
            {
                inviteType = ToolHtml.GetInviteTypes(prjName);
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳政府采购", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, infoUrl, htmlTxt);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            count++;
            parser = new Parser(new Lexer(htmldetail));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null)
            {
                for (int f = 0; f < fileNode.Count; f++)
                {
                    ATag tag = fileNode[f] as ATag;
                    if (tag == null || !tag.IsAtagAttach())
                    {
                        continue;
                    }

                    try
                    {
                        // Undo HTML-escaped ampersands in the href; the replacement used to be
                        // "&" -> "&", which changes nothing.
                        string href = tag.Link.Replace("&amp;", "&");
                        bool hasHost = tag.Link.ToLower().Contains(".com") || tag.Link.ToLower().Contains(".cn");
                        string attachUrl = hasHost ? href : "http://www.szzfcg.cn" + href;

                        BaseAttach attach = ToolHtml.GetBaseAttachByUrl(attachUrl, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                        if (attach != null)
                        {
                            ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                        }
                    }
                    catch
                    {
                        // Best effort: one failing attachment must not stop the crawl.
                    }
                }
            }

            // Pause for five minutes after every ten stored announcements.
            // NOTE(review): presumably throttling against the remote site - confirm.
            if (count >= 10)
            {
                count = 0;
                Thread.Sleep(1000 * 300);
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the JSON announcement feed at <c>SiteUrl + MaxCount</c>, fetches the content of
/// every listed announcement and hands the announcement, plus the attachments linked from
/// its content, to <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every entry of the feed; <c>false</c> to stop once
/// <c>MaxCount</c> announcements have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>
/// The (never populated) result list, or <c>null</c> when the feed cannot be downloaded or
/// parsed, or when the crawl stops because the <c>MaxCount</c> limit was reached.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int sqlCount = 0;
    string html;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl + this.MaxCount);
    }
    catch
    {
        return null;
    }

    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Only the text between the first '{' and the last '}' is parsed. A response without
    // such a span is treated like a failed download instead of throwing from Substring.
    int startIndex = html.IndexOf('{');
    int endIndex = html.LastIndexOf('}');
    if (startIndex < 0 || endIndex < startIndex)
    {
        return null;
    }

    JavaScriptSerializer serializer = new JavaScriptSerializer();
    Dictionary<string, object> smsTypeJson;
    try
    {
        smsTypeJson = serializer.DeserializeObject(html.Substring(startIndex, (endIndex + 1) - startIndex)) as Dictionary<string, object>;
    }
    catch (ArgumentException)
    {
        return null;
    }
    catch (InvalidOperationException)
    {
        return null;
    }

    if (smsTypeJson == null)
    {
        return null;
    }

    foreach (KeyValuePair<string, object> obj in smsTypeJson)
    {
        if (obj.Key == "total")
        {
            continue;
        }

        // Every other top-level key is expected to hold an array of rows; anything else is ignored.
        object[] array = obj.Value as object[];
        if (array == null)
        {
            continue;
        }

        foreach (object arrValue in array)
        {
            Dictionary<string, object> dic = arrValue as Dictionary<string, object>;
            if (dic == null)
            {
                continue;
            }

            string endDate = string.Empty, remark = string.Empty, htmlTxt = string.Empty;
            string code = ReadJsonText(dic, "gcBH");
            string prjName = ReadJsonText(dic, "gcName");
            string beginDate = ReadJsonText(dic, "ggStartTime2");
            string saveUrl = ReadJsonText(dic, "detailUrl");
            string infoUrl = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/queryOldOTDataDetail.do?type=1&id=" + code;

            try
            {
                htmlTxt = this.ToolWebSite.GetHtmlByUrl(infoUrl).GetJsString().GetReplace("\\t,\\r,\\n,\"");
                if (string.IsNullOrWhiteSpace(htmlTxt))
                {
                    // Empty detail page: fall back to the announcement endpoint, whose JSON
                    // answer carries the content under "html".
                    string url = "https://www.szjsjy.com.cn:8001/jyw-lg/jyxx/showGongGao.do?ggGuid=" + ReadJsonText(dic, "ggGuid");
                    string htmldtl = this.ToolWebSite.GetHtmlByUrl(url);
                    Dictionary<string, object> newTypeJson = (Dictionary<string, object>)serializer.DeserializeObject(htmldtl);
                    htmlTxt = Convert.ToString(newTypeJson["html"]);
                }
            }
            catch (Exception)
            {
                // Content could not be obtained: skip this announcement.
                continue;
            }

            string inviteCtx = htmlTxt.Replace("</span>", "\r\n").Replace("<br />", "\r\n").ToCtxString();
            string prjAddress = inviteCtx.GetAddressRegex();
            string buildUnit = inviteCtx.GetBuildRegex();
            if (string.IsNullOrEmpty(code))
            {
                code = inviteCtx.GetCodeRegex();
            }

            string msgType = "深圳市建设工程交易中心龙岗分中心";
            string specType = "建设工程";
            string inviteType = ToolHtml.GetInviteTypes(prjName);
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳龙岗区工程", "龙岗区", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, saveUrl, htmlTxt);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }

            Parser parser = new Parser(new Lexer(htmlTxt));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode == null)
            {
                continue;
            }

            for (int f = 0; f < fileNode.Count; f++)
            {
                ATag tag = fileNode[f] as ATag;
                if (tag == null)
                {
                    continue;
                }
                if (!tag.IsAtagAttach() && !tag.Link.ToLower().Contains("downloadfile"))
                {
                    continue;
                }

                try
                {
                    string link;
                    if (tag.Link.ToLower().Contains("http"))
                    {
                        // Absolute link: drop a trailing "//" and clean up backslashes. The
                        // former StartsWith("\\") branch re-assigned the identical string and
                        // is gone.
                        link = tag.Link;
                        if (link.EndsWith("//"))
                        {
                            link = link.Remove(link.LastIndexOf("//"));
                        }
                        link = link.GetReplace("\\", "");
                    }
                    else
                    {
                        link = "https://www.szjsjy.com.cn:8001/" + tag.Link;
                    }

                    BaseAttach attach = ToolHtml.GetBaseAttachByUrl(link, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // Best effort: skip an attachment that cannot be fetched or stored.
                    continue;
                }
            }
        }
    }

    return list;
}

/// <summary>
/// Returns the value stored under <paramref name="key"/> converted to text, or an empty
/// string when the key is absent or its value is <c>null</c>.
/// </summary>
private static string ReadJsonText(Dictionary<string, object> json, string key)
{
    object value;
    return json.TryGetValue(key, out value) ? Convert.ToString(value) : string.Empty;
}
/// <summary>
/// Walks the paged grid <c>ctl00_cph_context_GridView1</c>, downloads the detail page of
/// each listed entry and hands the entry, plus the files listed in the detail page's
/// download grid, to <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every list page; <c>false</c> to stop once <c>MaxCount</c>
/// entries have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>Always <c>null</c>; nothing is collected in a result list.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Number of list pages
    int totalPages = 1;
    int handled = 0;
    string listHtml = string.Empty;

    try
    {
        listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // The paging label reads "共N页".
    Parser parser = new Parser(new Lexer(listHtml));
    NodeList pagerNodes = parser.ExtractAllNodesThatMatch(
        new AndFilter(
            new TagNameFilter("span"),
            new HasAttributeFilter("id", "ctl00_cph_context_GridViewPaingTwo1_lblGridViewPagingDesc")));
    if (pagerNodes != null && pagerNodes.Count > 0)
    {
        try
        {
            string pagerText = pagerNodes.AsString();
            totalPages = Convert.ToInt32(pagerText.GetRegexBegEnd("共", "页"));
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int page = 1; page <= totalPages; page++)
    {
        if (page > 1)
        {
            try
            {
                // Post the "forward to page" form with the hidden fields of the current page.
                string viewState = this.ToolWebSite.GetAspNetViewState(listHtml);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(listHtml);
                string[] fieldNames =
                {
                    "__VIEWSTATE",
                    "__EVENTVALIDATION",
                    "ctl00$cph_context$GridViewPaingTwo1$txtGridViewPagingForwardTo",
                    "ctl00$cph_context$GridViewPaingTwo1$btnForwardToPage"
                };
                string[] fieldValues = { viewState, eventValidation, page.ToString(), "GO" };
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(fieldNames, fieldValues);
                listHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(listHtml));
        NodeList gridNodes = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_GridView1")));
        if (gridNodes == null || gridNodes.Count == 0)
        {
            continue;
        }

        TableTag grid = gridNodes[0] as TableTag;
        // Row 0 is skipped.
        for (int rowIndex = 1; rowIndex < grid.RowCount; rowIndex++)
        {
            TableRow row = grid.Rows[rowIndex];
            string infoSource = string.Empty;
            string headName = row.Columns[1].ToNodePlainString();
            string releaseTime = row.Columns[2].ToNodePlainString();
            string infoType = "政策法规";
            string infoUrl = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/" + row.Columns[1].GetATagHref();

            string detailHtml = string.Empty;
            try
            {
                detailHtml = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                // Deliberately ignored: an empty detail page yields no content span below,
                // so the row is dropped there.
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentNodes = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_span_MetContent")));
            if (contentNodes == null || contentNodes.Count == 0)
            {
                continue;
            }

            string ctxHtml = contentNodes.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");

            // Plain text: collapse whitespace, strip leftover tags, squeeze blank lines.
            string infoCtx = contentNodes.AsString();
            infoCtx = infoCtx.Replace(" ", "").Replace(" ", "");
            infoCtx = infoCtx.Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            infoCtx = infoCtx.Replace(" ", "").Replace("\t", "");
            infoCtx = infoCtx.Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");

            string msgType = MsgTypeCosnt.DongGuanMsgType;
            infoSource = infoSource.Replace(" ", "");
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "广东省", "东莞市区", string.Empty, infoCtx, infoType);

            if (!crawlAll && handled >= this.MaxCount)
            {
                return null;
            }
            handled++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments are listed in a separate download grid on the detail page.
            parser = new Parser(new Lexer(detailHtml));
            NodeList downloadGrids = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_DownLoadFiles1_GridView1")));
            if (downloadGrids == null || downloadGrids.Count == 0)
            {
                continue;
            }

            TableTag downloadGrid = downloadGrids[0] as TableTag;
            for (int fileIndex = 1; fileIndex < downloadGrid.RowCount; fileIndex++)
            {
                TableRow fileRow = downloadGrid.Rows[fileIndex];
                try
                {
                    // An attachment without a caption is named after the entry itself.
                    string caption = fileRow.Columns[1].ToNodePlainString();
                    string attName = string.IsNullOrEmpty(caption) ? headName : caption;
                    string fileUrl = "http://www.dgzb.com.cn/DGJYWEB/SiteManage/" + fileRow.Columns[1].GetATagHref();
                    BaseAttach baseInfo = ToolHtml.GetBaseAttachByUrl(fileUrl, attName, info.Id);
                    if (baseInfo != null)
                    {
                        ToolDb.SaveEntity(baseInfo, string.Empty);
                    }
                }
                catch
                {
                    // Best effort: one failing attachment must not stop the crawl.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Builds the attachment entities described by <paramref name="htmltxt"/> and stores them.
/// When <paramref name="htmltxt"/> contains "http" it is parsed as HTML and every link in it
/// becomes an attachment; otherwise it is converted with <c>ToolHtml.JsonToDataTable</c> and
/// every row (columns <c>attachName</c> / <c>attachGuid</c>) becomes an attachment.
/// </summary>
/// <param name="info">Project the attachments belong to; supplies the owner id for new
/// records and the fallback attachment name (<c>PrjName</c>).</param>
/// <param name="htmltxt">HTML fragment or JSON text describing the attachments.</param>
/// <param name="result">Owner id used instead of <c>info.Id</c> when
/// <paramref name="isUpdate"/> is <c>true</c>.</param>
/// <param name="isUpdate"><c>true</c> to replace the attachments already stored for
/// <paramref name="result"/>; <c>false</c> to add attachments for <c>info.Id</c>.</param>
private void SaveAttach(BidProject info, string htmltxt, string result, bool isUpdate)
{
    List<BaseAttach> pending = new List<BaseAttach>();

    if (!htmltxt.Contains("http"))
    {
        // JSON description: one row per attachment.
        System.Data.DataTable rows = ToolHtml.JsonToDataTable(htmltxt);
        if (rows != null && rows.Rows.Count > 0)
        {
            foreach (System.Data.DataRow row in rows.Rows)
            {
                string attachName = Convert.ToString(row["attachName"]);
                if (string.IsNullOrWhiteSpace(attachName))
                {
                    attachName = info.PrjName;
                }

                string attachGuid = Convert.ToString(row["attachGuid"]);
                string url = "https://www.szjsjy.com.cn:8001/file/downloadFile?fileId=" + attachGuid;
                try
                {
                    BaseAttach entity;
                    if (isUpdate)
                    {
                        entity = ToolHtml.GetBaseAttachByUrl(url, attachName, result, "SiteManage\\Files\\Attach\\");
                    }
                    else
                    {
                        entity = ToolHtml.GetBaseAttachByUrl(url, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                    }

                    if (entity != null)
                    {
                        pending.Add(entity);
                    }
                }
                catch
                {
                    // Best effort: skip an attachment that cannot be built.
                }
            }
        }
    }
    else
    {
        // HTML description: one link per attachment.
        Parser parser = new Parser(new Lexer(htmltxt));
        NodeList anchors = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        int anchorCount = (anchors != null) ? anchors.Count : 0;

        for (int index = 0; index < anchorCount; index++)
        {
            ATag anchor = anchors[index].GetATag();
            string attachName = anchor.LinkText;

            // Relative links are resolved against the download host.
            string address;
            if (anchor.Link.ToLower().Contains("http"))
            {
                address = anchor.Link.GetReplace("\\");
            }
            else
            {
                address = "https://www.szjsjy.com.cn:8001/" + anchor.Link.GetReplace("\\");
            }

            if (string.IsNullOrWhiteSpace(attachName))
            {
                attachName = info.PrjName;
            }

            try
            {
                // The second and third '&'-separated parts of the decoded address are swapped.
                // A link with fewer than three parts throws here and is skipped by the catch.
                string[] parts = System.Web.HttpUtility.UrlDecode(address).Split('&');
                string url = (parts[0] + "&" + parts[2] + "&" + parts[1]).Replace("\"", "");

                BaseAttach entity;
                if (isUpdate)
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, result, "SiteManage\\Files\\Attach\\");
                }
                else
                {
                    entity = ToolHtml.GetBaseAttach(url, attachName, info.Id, "SiteManage\\Files\\Attach\\");
                }

                if (entity != null)
                {
                    pending.Add(entity);
                }
            }
            catch
            {
                // Best effort: skip an attachment that cannot be built.
            }
        }
    }

    // Nothing usable was found: existing attachments are left untouched.
    if (pending.Count == 0)
    {
        return;
    }

    if (isUpdate)
    {
        // Replace semantics: remove what is stored for this owner before saving the new set.
        // NOTE(review): the statement is built by string formatting; confirm result can never
        // carry untrusted text, or switch to a parameterised call if ToolDb offers one.
        string delSql = string.Format("delete from BaseAttach where SourceID='{0}'", result);
        ToolFile.Delete(result);
        ToolDb.ExecuteSql(delSql);
    }

    foreach (BaseAttach attach in pending)
    {
        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
    }
}
/// <summary>
/// Crawls the notice list of www.hebggzy.cn page by page, downloads the detail page of
/// every listed notice and hands the notice, plus the attachments linked from its
/// content, to <c>ToolDb.SaveEntity</c>.
/// </summary>
/// <param name="crawlAll">
/// <c>true</c> to process every list page; <c>false</c> to stop as soon as
/// <c>MaxCount</c> notices have been passed to <c>ToolDb.SaveEntity</c>.
/// </param>
/// <returns>Always <c>null</c>; nothing is collected in a result list.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;

    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.SiteUrl, Encoding.UTF8);
    }
    catch
    {
        // Without the first list page there is nothing to crawl.
        return null;
    }

    // The pager cell's text is parsed as "current/total"; the page total follows the first '/'.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(
        new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "huifont")));
    if (pageNode != null && pageNode.Count > 0)
    {
        try
        {
            string temp = pageNode.AsString();
            pageInt = int.Parse(temp.Substring(temp.IndexOf("/") + 1));
        }
        catch
        {
            // Page total not recognised: keep the default and crawl the first page only.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                // Later pages are static files named after the page number.
                html = this.ToolWebSite.GetHtmlByUrl("http://www.hebggzy.cn/024/024002/" + i + ".html", Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList listNode = parser.ExtractAllNodesThatMatch(
            new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "right-text-li")));
        if (listNode == null || listNode.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < listNode.Count; j++)
        {
            // A list item without a link is skipped instead of throwing and aborting the crawl.
            INode node = listNode[j];
            ATag aTag = (node != null) ? node.GetATag() : null;
            if (aTag == null)
            {
                continue;
            }

            string infoType = "通知公告";
            string infoSource = string.Empty;
            string headName = aTag.GetAttribute("title");
            string releaseTime = node.ToPlainTextString().GetDateRegex();
            string infoUrl = "http://www.hebggzy.cn" + aTag.Link;

            string htmldtl;
            try
            {
                htmldtl = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldtl));
            NodeList dtlNode = parser.ExtractAllNodesThatMatch(
                new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "article-main")));
            if (dtlNode == null || dtlNode.Count == 0)
            {
                continue;
            }

            string ctxHtml = dtlNode.AsHtml();
            string infoCtx = ctxHtml.ToCtxString();
            string msgType = "河北省公共资源交易中心";
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoSource, msgType, infoUrl, ctxHtml, "河北省", "河北省及地市", "", infoCtx, infoType);

            // Test the limit BEFORE counting this notice. Counting first stopped the crawl
            // one notice early (and saved nothing at all when MaxCount was 1).
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate))
            {
                continue;
            }

            // Attachments are the links inside the notice content.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aNode == null)
            {
                continue;
            }

            for (int k = 0; k < aNode.Count; k++)
            {
                ATag a = aNode[k] as ATag;
                if (a == null)
                {
                    continue;
                }
                if (!a.Link.ToLower().Contains("download") && !a.IsAtagAttach())
                {
                    continue;
                }

                string link = a.Link.ToLower().Contains("http")
                    ? a.Link
                    : "http://www.hebggzy.cn/" + a.Link.GetReplace("../,./");

                // Links longer than 500 bytes are ignored.
                // NOTE(review): presumably a storage column limit - confirm.
                if (Encoding.Default.GetByteCount(link) > 500)
                {
                    continue;
                }

                try
                {
                    BaseAttach attach = ToolHtml.GetBaseAttachByUrl(link, a.LinkText, info.Id);
                    if (attach != null)
                    {
                        ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                    }
                }
                catch
                {
                    // Best effort: one failing attachment must not stop the notice crawl.
                }
            }
        }
    }

    return null;
}