protected override IList ExecuteCrawl(bool crawlAll) { //取得页码 int pageInt = 1, sqlCount = 0; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default); } catch (Exception ex) { return(null); } Parser parser = new Parser(new Lexer(html)); NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_cboPages")), true), new TagNameFilter("option"))); if (pageList != null && pageList.Count > 0) { try { pageInt = pageList.Count; } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { viewState = this.ToolWebSite.GetAspNetViewState(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( new string[] { "__EVENTARGUMENT", "dnn:ctr467:ArticleList:cboPages", "ScrollTop", "__dnnVariable", "__VIEWSTATE" }, new string[] { "", (i - 1).ToString(), "", "", viewState } ); html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "dnn_ctr467_ArticleList_PanelA")), true), new TagNameFilter("table"))); if (nodeList != null && nodeList.Count > 0) { TableTag table = nodeList[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty; TableRow tr = table.Rows[j]; infoType = "通知公告"; releaseTime = "20" + tr.Columns[2].ToPlainTextString().GetDateRegex("yy-MM-dd"); headName = tr.Columns[1].ToNodePlainString(); infoUrl = "http://www.szmea.net" + tr.Columns[1].GetATagHref(); string htldtl = string.Empty; try { htldtl = ToolHtml.GetHtmlByUrl(SiteUrl, infoUrl, Encoding.Default).GetJsString(); //ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString(); } catch { } parser = new Parser(new Lexer(htldtl)); NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "dnn_ctr391_ArticleShow_lblContent"))); if (noList != null && noList.Count > 0) { ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", ""); infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t"); infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); msgType = MsgTypeCosnt.ShenZhenJLGCMsgType; infoScorce = infoScorce.Replace(" ", ""); NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType); sqlCount++; if (!crawlAll && sqlCount >= this.MaxCount) { return(null); } if (ToolDb.SaveEntity(info, this.ExistCompareFields)) { parser = new Parser(new Lexer(ctxHtml)); NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img")); if (imgList != null && imgList.Count > 0) { for (int m = 0; m < imgList.Count; m++) { try { ImageTag img = imgList[m] as ImageTag; string src = img.GetAttribute("src"); if (src.ToLower().Contains(".gif")) { continue; } BaseAttach obj = null; if (src.Contains("http")) { obj = ToolHtml.GetBaseAttach(src, headName, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.szmea.net" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { } } } parser = new Parser(new Lexer(ctxHtml)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int a = 0; a < aNode.Count; a++) { ATag aTag = aNode[a] as ATag; if (aTag.IsAtagAttach()) { try { BaseAttach obj = null; string href = aTag.GetATagHref(); if (href.Contains("http")) { obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.szmea.net" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { continue; } } } } } } } } } return(null); }
protected override IList ExecuteCrawl(bool crawlAll) { //取得页码 int pageInt = 1, sqlCount = 0; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8); } catch (Exception ex) { return(null); } Parser parser = new Parser(new Lexer(html)); NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("a"), new HasAttributeFilter("id", "PageDataList__ctl7_LinkButton1"))); if (pageList != null && pageList.Count > 0) { try { string temp = pageList.AsString(); pageInt = Convert.ToInt32(temp.GetRegexBegEnd("共", "页")); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { viewState = this.ToolWebSite.GetAspNetViewState(html); NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection( new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "head1:username", "head1:Password", "head1:rbLoginType", "Tb_keyword", "ddlNewsType", "ddlistaddnewsdate" }, new string[] { "PageDataList$_ctl" + (i + 1).ToString() + "$LinkButton1", "", viewState, "", "", "unit", "", "20", "" } ); html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", " tb_list"))); if (nodeList != null && nodeList.Count > 0) { TableTag table = nodeList[0] as TableTag; for (int j = 0; j < table.RowCount; j++) { string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty; TableRow tr = table.Rows[j]; infoType = "通知公告"; releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex(); headName = tr.Columns[1].ToNodePlainString(); infoUrl = "http://www.szpark.com.cn" + tr.Columns[1].GetATagHref(); string htldtl = string.Empty; try { htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString(); } catch { } parser = new Parser(new Lexer(htldtl)); NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "newsinfo"))); if (noList != null && noList.Count > 0) { ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", ""); infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t"); infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); msgType = MsgTypeCosnt.ShenZhenFJYLMsgType; infoScorce = infoScorce.Replace(" ", ""); NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType); sqlCount++; if (!crawlAll && sqlCount >= this.MaxCount) { return(null); } if (ToolDb.SaveEntity(info, this.ExistCompareFields)) { parser = new Parser(new Lexer(ctxHtml)); NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img")); if (imgList != null && imgList.Count > 0) { for (int m = 0; m < imgList.Count; m++) { try { ImageTag img = imgList[m] as ImageTag; string src = img.GetAttribute("src"); if (src.ToLower().Contains(".gif")) { continue; } BaseAttach obj = null; if (src.Contains("http")) { obj = ToolHtml.GetBaseAttach(src, headName, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.szpark.com.cn" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { } } } parser = new Parser(new Lexer(ctxHtml)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int a = 0; a < aNode.Count; a++) { ATag aTag = aNode[a] as ATag; if (aTag.IsAtagAttach()) { try { BaseAttach obj = null; string href = aTag.GetATagHref(); if (href.Contains("http")) { obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.szpark.com.cn" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { } } } } } } } } } return(null); }
protected override IList ExecuteCrawl(bool crawlAll) { //取得页码 int pageInt = 1, sqlCount = 0; string html = string.Empty; string viewState = string.Empty; string eventValidation = string.Empty; try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.Default); } catch (Exception ex) { return(null); } Parser parser = new Parser(new Lexer(html)); NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "f12"))); if (pageList != null && pageList.Count > 0) { try { string temp = pageList.AsString(); pageInt = Convert.ToInt32(temp.GetRegexBegEnd("/", "页")); } catch { pageInt = 1; } } for (int i = 1; i <= pageInt; i++) { if (i > 1) { try { html = this.ToolWebSite.GetHtmlByUrl(SiteUrl + "&page=" + i.ToString(), Encoding.Default); } catch { continue; } } parser = new Parser(new Lexer(html)); NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("height", "32")), true), new TagNameFilter("table"))); if (nodeList != null && nodeList.Count > 0) { TableTag table = nodeList[0] as TableTag; for (int j = 0; j < table.RowCount - 1; j++) { if ((j + 1) % 2 == 0) { continue; } string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty; TableRow tr = table.Rows[j]; infoType = "通知公告"; releaseTime = tr.Columns[2].ToPlainTextString().GetDateRegex(); headName = tr.Columns[1].ToNodePlainString(); infoUrl = "http://www.jianzhuxh.com/news/" + tr.Columns[1].GetATagValue("onclick").GetRegexBegEnd("'", "'"); string htldtl = string.Empty; try { htldtl = ToolHtml.GetHtmlByUrl(infoUrl, Encoding.Default).GetJsString(); } catch { continue; } parser = new Parser(new Lexer(htldtl)); NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("class", "text18"))); if (noList != null && noList.Count > 0) { ctxHtml = noList[0].ToHtml().Replace("<br/>", "\r\n").Replace("<BR/>", ""); infoCtx = ctxHtml.ToCtxString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t"); infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n"); msgType = MsgTypeCosnt.ShenZhenJZYMsgType; infoScorce = infoScorce.Replace(" ", ""); NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType); sqlCount++; if (!crawlAll && sqlCount >= this.MaxCount) { return(null); } if (ToolDb.SaveEntity(info, this.ExistCompareFields)) { parser = new Parser(new Lexer(ctxHtml)); NodeList imgList = parser.ExtractAllNodesThatMatch(new TagNameFilter("img")); if (imgList != null && imgList.Count > 0) { for (int m = 0; m < imgList.Count; m++) { try { ImageTag img = imgList[m] as ImageTag; string src = img.GetAttribute("src"); if (src.ToLower().Contains(".gif")) { continue; } BaseAttach obj = null; if (src.Contains("http")) { obj = ToolHtml.GetBaseAttach(src, headName, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.jianzhuxh.com" + src.Replace("../", "/").Replace("./", "/"), headName, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { } } } parser = new Parser(new Lexer(ctxHtml)); NodeList aNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a")); if (aNode != null && aNode.Count > 0) { for (int a = 0; a < aNode.Count; a++) { ATag aTag = aNode[a] as ATag; if (aTag.IsAtagAttach()) { try { BaseAttach obj = null; string href = aTag.GetATagHref(); if (href.Contains("http")) { obj = ToolHtml.GetBaseAttach(href, aTag.LinkText, info.Id); } else { obj = ToolHtml.GetBaseAttach("http://www.jianzhuxh.com" + href.Replace("../", "/").Replace("./", "/"), aTag.LinkText, info.Id); } if (obj != null) { ToolDb.SaveEntity(obj, string.Empty); } } catch { } } } } } } } } } return(null); }