/// <summary>
/// Decides whether the entries of an index page are stored in <c>li</c> elements or in a table.
/// </summary>
/// <param name="rawtext">Page source with style and other irrelevant tags already removed.</param>
/// <returns>
/// <c>true</c> when the page is table-style (no <c>li</c> element, or no <c>li</c> element whose
/// HTML contains a two-digit:two-digit time); <c>false</c> when it is li-style.
/// </returns>
public static bool GetPattern(string rawtext)
{
    Parser liParser = new Parser(new Lexer(rawtext));
    NodeList liNodes = liParser.Parse(new TagNameFilter("li"));

    // One <li> carrying a publication time is enough to call the page li-style.
    Regex timePattern = new Regex(@"\d\d:\d\d");
    for (int index = liNodes.Count - 1; index >= 0; index--)
    {
        if (timePattern.IsMatch(liNodes[index].ToHtml()))
        {
            return false;
        }
    }

    // Either there were no <li> elements at all, or none of them contained a time.
    return true;
}
/// <summary>
/// Rebuilds an HTML document string from the first <c>TR</c> match of the parsed source.
/// The first child of that node is emitted as the head part; the tag children of the
/// second child's first child are emitted as the body content.
/// </summary>
/// <param name="sourceHtml">HTML source; line breaks are stripped before parsing.</param>
/// <returns>The reassembled HTML string.</returns>
private string HtmlText(string sourceHtml)
{
    string flattened = sourceHtml.Replace(System.Environment.NewLine, "");
    hParser.Parser parser = hParser.Parser.CreateParser(flattened, "utf-8");
    hParser.NodeFilter rowFilter = new TagNameFilter("TR");
    hParser.INode firstMatch = parser.Parse(rowFilter)[0];

    string headHtml = firstMatch.Children[0].ToHtml();
    hParser.INode container = firstMatch.Children[1].Children[0];

    // Only tag nodes are copied; text and remark nodes between them are dropped.
    StringBuilder tagChildren = new StringBuilder();
    int childCount = container.Children.Count;
    for (int index = 0; index < childCount; index++)
    {
        hParser.INode child = container.Children[index];
        if (child is hParser.ITag)
        {
            tagChildren.Append(child.ToHtml());
        }
    }

    // The container's opening tag is rebuilt from its text; the closing tag is always "</div>".
    StringBuilder document = new StringBuilder();
    document.Append("<html>")
            .Append(headHtml)
            .Append("<body>")
            .Append("<" + container.GetText() + ">")
            .Append(tagChildren.ToString())
            .Append("</div>")
            .Append("</body>")
            .Append("</html>");
    return document.ToString();
}
// Returns one page of tags, filtered by the query-supplied tag-name filter.
// Both arguments are bound from the query string and passed to the tag service.
public async Task<PaginatedData<TagOutput>> GetAllTagsAsync(
    [FromQuery] Pagination pagination,
    [FromQuery] TagNameFilter tagNameFilter)
{
    var page = await _tagService.FindAllTagsAsync(tagNameFilter, pagination);
    return page;
}
/// <summary>
/// Returns the plain text of the first <c>TITLE</c> element in the given HTML,
/// after passing the content through <c>DropComment</c>.
/// </summary>
/// <param name="content">HTML source.</param>
/// <returns>The title text, or an empty string when no <c>TITLE</c> element is found.</returns>
public static string GetTitleFromContent(string content)
{
    string withoutComments = DropComment(content);
    Parser titleParser = new Parser(new Lexer(withoutComments));
    NodeList titles = titleParser.ExtractAllNodesThatMatch(new TagNameFilter("TITLE"));

    if (titles.Count == 0)
    {
        return "";
    }
    return titles[0].ToPlainTextString();
}
/// <summary>
/// Converts the elements of an HTML document that carry a given tag name into an XML string.
/// </summary>
/// <param name="html">HTML source text.</param>
/// <param name="targetTag">Name of the tag whose elements are converted.</param>
/// <returns>
/// XML text with an XML declaration and a <c>Tags</c> root element holding one child element
/// per matched HTML element.
/// </returns>
/// <exception cref="Exception">
/// Thrown for any failure during parsing or conversion; the original exception is attached
/// as <c>InnerException</c>.
/// </exception>
public static string CovertHtmlToXml(string html, string targetTag)
{
    try
    {
        XmlDocument doc = new XmlDocument();
        doc.AppendChild(doc.CreateXmlDeclaration("1.0", "utf-8", null));

        // Parse the HTML with htmlparser and keep only the requested tag.
        Parser parser = Parser.CreateParser(html, "GBK");
        NodeList nodes = parser.Parse(new TagNameFilter(targetTag));

        XmlElement root = doc.CreateElement("Tags");
        for (int i = 0; i < nodes.Size(); i++)
        {
            TagNode tagNode = nodes[i] as TagNode;
            if (tagNode == null)
            {
                // Not a TagNode: nothing to read a name or attributes from.
                continue;
            }

            XmlElement parent = doc.CreateElement(tagNode.TagName);

            // Copy attributes, skipping names that are not valid XML names.
            foreach (DictionaryEntry ent in tagNode.Attributes)
            {
                string attrName = ent.Key.ToString();
                if (Regex.IsMatch(attrName, validName))
                {
                    XmlAttribute attr = doc.CreateAttribute(attrName);
                    // A value-less attribute becomes an empty XML attribute instead of throwing.
                    attr.Value = ent.Value == null ? string.Empty : ent.Value.ToString();
                    parent.Attributes.Append(attr);
                }
            }

            AppendChild(tagNode, parent, doc);
            root.AppendChild(parent);
        }

        doc.AppendChild(root);
        return doc.OuterXml;
    }
    catch (Exception ex)
    {
        // Keep the original exception so its type and stack trace are not lost.
        throw new Exception("转换html内容出错:" + ex.Message, ex);
    }
}
/// <summary>
/// Returns the HTML of the page body with script elements removed.
/// </summary>
/// <param name="html">HTML source of the page.</param>
/// <returns>The body HTML without script elements, including scripts nested inside other tags.</returns>
public string GetBodyHtml(string html)
{
    HtmlPage page = GetPage(html);
    NodeList bodyNodes = page.Body;

    // The search is recursive, so it also returns scripts nested inside other tags.
    NodeList scripts = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);
    for (int i = 0; i < scripts.Size(); i++)
    {
        var script = scripts[i];

        // Top-level scripts are held directly by the body list.
        bodyNodes.Remove(script);

        // Nested scripts are held by their parent's child list; removing them only from
        // the body list (as the code did before) leaves them in the rendered output.
        if (script.Parent != null && script.Parent.Children != null)
        {
            script.Parent.Children.Remove(script);
        }
    }

    return bodyNodes.ToHtml();
}
// Returns one page of tags as TagOutput items.
// When the filter carries a non-blank name, tags are restricted either to an exact
// name match (SearchType.Equals) or to names containing the filter text (any other search type).
public async Task<PaginatedData<TagOutput>> FindAllTagsAsync(TagNameFilter tagNameFilter, Pagination pagination)
{
    var tags = _tagRepository.FindAll();

    if (!string.IsNullOrWhiteSpace(tagNameFilter.Name))
    {
        if (tagNameFilter.SearchType == SearchType.Equals)
        {
            tags = tags.Where(tag => tagNameFilter.Name == tag.Name);
        }
        else
        {
            tags = tags.Where(tag => tag.Name.Contains(tagNameFilter.Name!));
        }
    }

    var outputs = tags.Select(tag => new TagOutput(tag));
    return await outputs.PaginateAsync(pagination);
}
/// <summary>
/// Removes script elements from HTML content.
/// </summary>
/// <param name="html">HTML content.</param>
/// <returns>
/// The input unchanged when it contains no script element; otherwise the parsed page text
/// with every script element's HTML removed.
/// </returns>
public static String HtmlFilter(String html)
{
    NodeFilter scriptFilter = new TagNameFilter("script");
    String current = html;

    // Previously the method called itself and threw the result away, so only the first
    // script element was ever stripped. Loop until no script element is left.
    while (true)
    {
        Parser parser = Parser.CreateParser(current, "utf-8");
        NodeList scripts = parser.Parse(scriptFilter);
        if (scripts.Count == 0)
        {
            return current;
        }

        String pageText = scripts[0].Page.GetText();
        String stripped = pageText.Replace(scripts[0].ToHtml(), "");

        // If the rendered script does not occur verbatim in the page text, nothing was
        // removed; stop here instead of looping forever.
        if (stripped.Length == pageText.Length)
        {
            return stripped;
        }

        current = stripped;
    }
}
/// <summary>
/// Extracts the HTML selected by a filter model: elements of type <c>model.EType</c>,
/// optionally narrowed by id and CSS class, optionally stripped of scripts, with images
/// downloaded locally and, when both markers are set, cut down to the text between
/// <c>model.Start</c> and <c>model.End</c>.
/// </summary>
/// <param name="html">HTML source.</param>
/// <param name="model">Filter description (element type, id, class, script flag, start/end markers).</param>
/// <returns>The page title when the element type is "title"; otherwise the filtered HTML.</returns>
public string GetByFilter(string html, FilterModel model)
{
    if (model.EType.ToLower().Equals("title"))
    {
        return GetTitle(html);
    }

    NodeList nodes = GetTagList(html, model.EType);

    if (!string.IsNullOrEmpty(model.ID))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("id", model.ID));
    }
    if (!string.IsNullOrEmpty(model.CSS))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", model.CSS));
    }

    if (!model.AllowScript)
    {
        // ExtractAllNodesThatMatch only returns the matches; the old code discarded that
        // result, so scripts were never removed. Detach each match from its owning list.
        NodeList scripts = nodes.ExtractAllNodesThatMatch(new TagNameFilter("script"), true);
        for (int i = 0; i < scripts.Count; i++)
        {
            var script = scripts[i];
            nodes.Remove(script);
            if (script.Parent != null && script.Parent.Children != null)
            {
                script.Parent.Children.Remove(script);
            }
        }
    }

    // Download images locally and point each tag at the value returned by DownFile.
    NodeList imgs = nodes.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
    for (int i = 0; i < imgs.Count; i++)
    {
        ImageTag img = imgs[i] as ImageTag;
        if (img == null)
        {
            continue;
        }

        string savepath = function.VToP(vdir + Path.GetFileName(img.ImageURL));
        if (File.Exists(savepath))
        {
            // Already downloaded: do not fetch the image again.
            continue;
        }
        img.ImageURL = httpHelp.DownFile(baseurl, img.ImageURL, savepath);
    }

    string result = nodes.AsHtml();
    if (!string.IsNullOrWhiteSpace(model.Start) && !string.IsNullOrWhiteSpace(model.End))
    {
        result = regHelper.GetValueBySE(result, model.Start, model.End);
    }
    return result;
}
/// <summary>
/// Reads the <c>href</c> and <c>title</c> attributes of the tag that follows the first child
/// of the <paramref name="num"/>-th <c>dd</c> element in the given HTML.
/// </summary>
/// <param name="HtmlString">HTML source.</param>
/// <param name="num">Zero-based index into the list of <c>dd</c> elements.</param>
/// <param name="href">Receives the <c>href</c> attribute; empty string when nothing is found.</param>
/// <param name="title">Receives the <c>title</c> attribute; empty string when nothing is found.</param>
public static void parseIndexHtml(string HtmlString, int num, out string href, out string title)
{
    href = "";
    title = "";

    Parser parser = Parser.CreateParser(HtmlString, "utf-8");

    // Matches every <dd>. The previous code also built a dl.txt_box parent filter but never
    // used it; that dead allocation is removed and the matching is unchanged.
    NodeFilter filter = new TagNameFilter("dd");
    NodeList nodes = parser.Parse(filter);

    if (num < 0 || nodes.Size() <= num)
    {
        return;
    }

    INode firstChild = nodes[num].FirstChild;
    if (firstChild == null)
    {
        return;
    }

    ITag tag = getTag(firstChild.NextSibling);
    if (tag == null)
    {
        return;
    }

    href = tag.GetAttribute("href");
    title = tag.GetAttribute("title");
}
/// <summary>
/// Parses an inventory report page into a <see cref="DataTable"/>.
/// Rows are read from the table at index 22 of the document, starting at its third row.
/// </summary>
/// <param name="strBuff">HTML source of the report page.</param>
/// <returns>
/// A table with the columns ArticleNo, StoreCode, StoreName, Inventory, Sales, Date, BarCode,
/// TopStoreCode and TopStoreName, one row per parsed report row.
/// </returns>
public DataTable InventoryData(string strBuff)
{
    DataTable dt = new DataTable();
    dt.Columns.Add("ArticleNo");
    dt.Columns.Add("StoreCode");
    dt.Columns.Add("StoreName");
    dt.Columns.Add("Inventory");
    dt.Columns.Add("Sales");
    dt.Columns.Add("Date");
    dt.Columns.Add("BarCode");
    dt.Columns.Add("TopStoreCode");
    dt.Columns.Add("TopStoreName");

    // Parse the document once (it used to be parsed twice, inside a loop that ran one time).
    NodeList tables = new Parser(new Lexer(strBuff)).Parse(new TagNameFilter("Table"));
    NodeList rows = new Parser(new Lexer(tables[22].ToHtml())).Parse(new TagNameFilter("TR"));

    // One date stamp for the whole batch.
    string strDate = DateTime.Now.ToString("yyyy-MM-dd");

    // The article number carries over from the previous row when its cell has no children.
    string strArticleNo = string.Empty;
    for (int i = 2; i < rows.Count; i++)
    {
        NodeList cells = rows[i].Children;

        if (cells[0].Children != null)
        {
            strArticleNo = cells[0].Children[0].ToHtml().Replace(" ", "");
        }

        // The store cell is split on '-': code first, name second.
        string[] storeParts = cells[2].Children[0].ToHtml().Split('-');
        string strStoreCode = storeParts[0];
        string strStoreName = storeParts[1];
        string strInventory = cells[4].Children[0].ToHtml();
        string strSales = cells[9].Children[0].ToHtml();

        // Look up the bar code for the article.
        string barCode = DataHelper.GetBarCode(strArticleNo, _dtGoods);

        // Look up the top-level customer code and short name for the store.
        string strTopStoreCode = "";
        string strTopStoreName = "";
        DataHelper.GetTopStrore(strStoreCode, _dtTopEnterprise, ref strTopStoreCode, ref strTopStoreName);

        dt.Rows.Add(strArticleNo, strStoreCode, strStoreName, strInventory, strSales,
                    strDate, barCode, strTopStoreCode, strTopStoreName);
    }

    return dt;
}
/// <summary>
/// Crawls the yododo.com question list. Takes the anchors of the first <c>li</c> in the
/// first "miniarea-list clearfix" list, follows each one (except "全部" and "中国"), and
/// for every anchor on the followed page whose text matches a level-3 route class name
/// calls <c>paserData</c>. Prints "success" when finished.
/// </summary>
/// <param name="context">HTTP context passed to <c>Print</c>.</param>
private void GrapBaiduMsg(HttpContext context)
{
    string listPageHtml = GetHtmlStr("http://www.yododo.com/ask/list/");

    ClassLibrary.BLL.RouteClass routeClassBll = new ClassLibrary.BLL.RouteClass();
    List<ClassLibrary.Model.RouteClass> routeClasses = routeClassBll.GetModelList("classLevel = 3");

    NodeFilter areaListFilter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
    NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));

    // First area list on the landing page -> its first <li> -> the anchors inside it.
    NodeList areaLists = Parser.CreateParser(listPageHtml, "utf-8").Parse(areaListFilter);
    string firstAreaListHtml = areaLists[0].ToHtml();
    NodeList areaItems = Parser.CreateParser(firstAreaListHtml, "utf-8").Parse(new TagNameFilter("li"));
    string firstItemHtml = areaItems[0].ToHtml();
    NodeList continentLinks = Parser.CreateParser(firstItemHtml, "utf-8").Parse(anchorFilter);

    for (int c = 0; c < continentLinks.Count; c++)
    {
        ITag continentTag = getTag(continentLinks[c]);
        string continentUrl = "http://www.yododo.com" + continentTag.GetAttribute("href"); // one page per continent
        string continentName = continentTag.ToPlainTextString();
        if (continentName == "全部" || continentName == "中国")
        {
            continue;
        }

        // The continent page uses the same list markup; take the anchors of its first list.
        NodeList subAreaLists = Parser.CreateParser(GetHtmlStr(continentUrl), "utf-8").Parse(areaListFilter);
        string subAreaListHtml = subAreaLists[0].ToHtml();
        NodeList subAreaLinks = Parser.CreateParser(subAreaListHtml, "utf-8").Parse(anchorFilter);

        for (int k = 0; k < subAreaLinks.Count; k++)
        {
            ITag linkTag = getTag(subAreaLinks[k]);
            string targetUrl = "http://www.yododo.com" + linkTag.GetAttribute("href") + "s1"; // "s1": the original comment labels this "resolved"
            string linkText = linkTag.ToPlainTextString();
            if (linkText == "全部")
            {
                continue;
            }

            ClassLibrary.Model.RouteClass matched = routeClasses.Find(rc => rc.ClassName == linkText);
            if (matched == null)
            {
                continue;
            }

            paserData(subAreaLinks[k], targetUrl, matched.ID);
        }
    }

    Print(context, "success");
}
/// <summary>
/// Crawls the notice list at <c>SiteUrl</c> page by page, downloads each notice's detail
/// page, saves it through <c>ToolDb.SaveEntity</c> and registers attachment links found in
/// the saved content through <c>AddBaseFile</c>.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops once <c>MaxCount</c> notices have been processed.
/// </param>
/// <returns>Always <c>null</c>; results are persisted as they are found.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    // Page count defaults to 1 when the pager cannot be read.
    int pageInt = 1, sqlCount = 0;
    string html = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridViewPaging1_lblGridViewPagingDesc")));
    if (pageList != null && pageList.Count > 0)
    {
        string pageStr = pageList[0].ToPlainTextString().Trim();
        try
        {
            Regex regexPage = new Regex(@"页,共[^页]+页");
            Match pageMatch = regexPage.Match(pageStr);
            pageInt = int.Parse(pageMatch.Value.Replace("页,共", "").Replace("页", "").Trim());
        }
        catch
        {
            // Best effort: keep the default of one page when the pager text cannot be parsed.
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested by posting the pager form with the current
            // view state and event validation values.
            viewState = this.ToolWebSite.GetAspNetViewState(html);
            eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ctl00$ScriptManager1", "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "ctl00$cph_context$RightList1$search", "ctl00$cph_context$RightList1$txtTitle", "ctl00$cph_context$RightList1$GridViewPaging1$txtGridViewPagingForwardTo", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage" },
                new string[] { "ctl00$cph_context$RightList1$update1|ctl00$cph_context$RightList1$GridViewPaging1$btnForwardToPage", "", "", viewState, "标题", "", i.ToString(), "", eventValidation, "GO" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_RightList1_GridView1")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }

        TableTag table = dtList[0] as TableTag;
        // Row 0 is skipped; data rows start at index 1.
        for (int j = 1; j < table.RowCount; j++)
        {
            string headName = string.Empty, releaseTime = string.Empty, infoScorce = string.Empty, msgType = string.Empty, infoUrl = string.Empty, ctxHtml = string.Empty, infoCtx = string.Empty, infoType = string.Empty;
            TableRow tr = table.Rows[j];
            headName = tr.Columns[1].ToPlainTextString().Trim();
            releaseTime = tr.Columns[3].ToPlainTextString().Trim();
            infoScorce = tr.Columns[2].ToPlainTextString().Trim();
            infoType = "通知公告";
            ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
            infoUrl = "http://jyzx.cb.gov.cn/LGjyzxWeb/SiteManage/" + aTag.Link;
            if (infoUrl.Contains("%25"))
            {
                infoUrl = infoUrl.Replace("%25", "%");
            }

            string htmldetailtxt = string.Empty;
            try
            {
                htmldetailtxt = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldetailtxt));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "text_contend")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            infoCtx = noList.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            msgType = "深圳市龙岗建设工程交易中心";
            infoScorce = infoScorce.Replace(" ", "");
            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳龙岗区工程", string.Empty, infoCtx, infoType);

            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }

            sqlCount++;
            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Attachments: links inside the upload grid of the saved content.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList fileList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_cph_context_JyxxUploadFile1_GridView1")));
            if (fileList == null || fileList.Count == 0)
            {
                continue;
            }

            string fileHtl = fileList.AsHtml();
            parser = new Parser(new Lexer(fileHtl));
            NodeFilter aLink = new TagNameFilter("a");
            NodeList aList = parser.ExtractAllNodesThatMatch(aLink);
            if (aList != null && aList.Count > 0)
            {
                // Search once; this used to be repeated on every iteration of the loop.
                NodeList attachmentTags = aList.SearchFor(typeof(ATag), true);
                for (int k = 0; k < aList.Count; k++)
                {
                    ATag a = attachmentTags[k] as ATag;
                    if (a != null)
                    {
                        AddBaseFile("http://jyzx.cb.gov.cn/LGjyzxWeb/" + a.Link.Replace("../", ""), a.LinkText, info);
                    }
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Extracts post URLs, post titles and post times from a table-based post list page.
/// </summary>
/// <param name="url">URL of the post list page.</param>
/// <param name="encode">Not read or written by this method.</param>
/// <param name="listUrl">Receives one entry per anchor found in the kept table rows (may be <c>null</c> for an anchor without <c>href</c>).</param>
/// <param name="listTitle">Receives the text of each of those anchors, after <c>ModifyRawText</c>.</param>
/// <param name="listTime">Receives every node matching a two-digit:two-digit time in the kept rows, after <c>ModifyRawText</c>.</param>
public static void ItemRetrival_1(string url, ref Encoding encode, ref List<string> listUrl, ref List<string> listTitle, ref List<string> listTime)
{
    // Fetch the page source and strip style, select, script blocks and HTML comments.
    string rawtext = GetDataFromUrl(url);
    string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";
    rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");

    NodeList htmlNodes = new Parser(new Lexer(rawtext)).Parse(new TagNameFilter("table"));

    // Keep only tables that contain no nested table (a single opening tag) and that
    // contain a time; other tables are treated as irrelevant.
    Regex tableOpenTag = new Regex(@"<table.*?>");
    Regex timePattern = new Regex(@"\d\d:\d\d");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        string tableHtml = htmlNodes[i].ToHtml();
        if (tableOpenTag.Matches(tableHtml).Count > 1 || !timePattern.IsMatch(tableHtml))
        {
            htmlNodes.Remove(i);
        }
    }

    NodeList finalNodes = new Parser(new Lexer(htmlNodes.ToHtml())).Parse(new TagNameFilter("tr"));

    // Post times: every node in a row that matches the time pattern.
    RegexFilter rf = new RegexFilter(@"\d\d:\d\d");
    for (int i = 0; i < finalNodes.Count; i++)
    {
        NodeList tmp = new Parser(new Lexer(finalNodes[i].ToHtml())).Parse(rf);
        for (int j = 0; j < tmp.Count; j++)
        {
            string temp = tmp[j].ToHtml();
            ModifyRawText(ref temp);
            listTime.Add(temp);
        }
    }

    // Post URLs and titles: every anchor in the kept rows.
    NodeList atagNodes = new Parser(new Lexer(finalNodes.ToHtml())).Parse(new TagNameFilter("a"));

    // Scheme + host of the list page. The old pattern needed "http://" plus a following
    // slash, so it produced nothing for https pages or for URLs without a path.
    string urlpart = new Regex(@"https?://[^/]+", RegexOptions.IgnoreCase).Match(url).Value;

    // A link is absolute only when it starts with a scheme; the old test treated any
    // href containing "http" anywhere (e.g. in a query string) as absolute.
    Regex absoluteUrl = new Regex(@"^\s*https?://", RegexOptions.IgnoreCase);

    for (int i = 0; i < atagNodes.Count; i++)
    {
        ATag link = (ATag)atagNodes.ElementAt(i);
        string temp1 = link.GetAttribute("href");
        string temp2 = link.StringText;
        if (temp1 != null && !absoluteUrl.IsMatch(temp1))
        {
            // Relative link: prefix it with the list page's scheme and host.
            temp1 = urlpart + temp1;
        }
        ModifyRawText(ref temp2);
        listUrl.Add(temp1);
        listTitle.Add(temp2);
    }
}
/// <summary>
/// Downloads a page, hands its decoded HTML to <c>download_pic</c>, then recursively visits
/// every anchor target on it that is not yet recorded in <c>links</c>. Each visited target
/// is written to the console and appended to the crawl log.
/// </summary>
/// <param name="url">Absolute URL of the page to download.</param>
static void download_url(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393";
    request.Timeout = 30000;
    try
    {
        // Read the whole body and release the response BEFORE following links. The old
        // code recursed inside the using blocks, so every nesting level kept its
        // connection open; the per-host connection limit can then stall deeper requests
        // until they time out.
        string html = null;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK)
            {
                using (Stream s = response.GetResponseStream())
                using (StreamReader sr = new StreamReader(s, Encoding.UTF8))
                {
                    html = sr.ReadToEnd();
                }
            }
            else
            {
                Console.WriteLine("Error");
            }
        }

        if (html == null)
        {
            return;
        }

        string encode = HttpUtility.HtmlDecode(html);
        download_pic(encode);

        NodeList nodes = new Parser(new Lexer(encode)).ExtractAllNodesThatMatch(new TagNameFilter("a"));
        for (int i = 0; i < nodes.Count; i++)
        {
            ITag tag = nodes[i] as ITag;
            string href = tag.GetAttribute("href");

            // Skip targets that have already been recorded.
            bool isexist = false;
            foreach (string ss in links)
            {
                if (ss == href)
                {
                    isexist = true;
                    break;
                }
            }
            if (isexist)
            {
                continue;
            }

            links.Add(href);
            string target = "http://taylorpictures.net/" + href;
            Console.WriteLine("accessing " + target);
            using (FileStream fs = new FileStream(@"e:/Photos/crawl_log.txt", FileMode.Append))
            {
                byte[] bytes = Encoding.UTF8.GetBytes("accessing " + target + "\r\n");
                fs.Write(bytes, 0, bytes.Length);
            }
            download_url(target);
        }
    }
    catch
    {
        // Any failure (request, parsing, or a nested visit's own processing) is reported
        // as "404" and ends the handling of this page.
        Console.WriteLine("404");
    }
}
/// <summary>
/// Extracts post URLs, post titles and post times from an li-based post list page.
/// </summary>
/// <param name="url">URL of the post list page.</param>
/// <param name="encode">Not read or written by this method.</param>
/// <param name="listUrl">Receives one entry per anchor found in the kept <c>li</c> elements (may be <c>null</c> for an anchor without <c>href</c>).</param>
/// <param name="listTitle">Receives the text of each of those anchors, after <c>ModifyRawText</c>.</param>
/// <param name="listTime">Receives every node matching a two-digit:two-digit time in the kept elements, after <c>ModifyRawText</c>.</param>
public static void ItemRetrival_2(string url, ref Encoding encode, ref List<string> listUrl, ref List<string> listTitle, ref List<string> listTime)
{
    // Fetch the page source and strip style, select, script blocks and HTML comments.
    string rawtext = GetDataFromUrl(url);
    string reg1 = @"<style[\s\S]+?/style>|<select[\s\S]+?/select>|<script[\s\S]+?/script>|<\!\-\-[\s\S]*?\-\->";
    rawtext = new Regex(reg1, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(rawtext, "");

    NodeList htmlNodes = new Parser(new Lexer(rawtext)).Parse(new TagNameFilter("li"));

    // Drop <li> entries that do not contain a time.
    Regex timePattern = new Regex(@"\d\d:\d\d");
    for (int i = htmlNodes.Count - 1; i >= 0; i--)
    {
        if (!timePattern.IsMatch(htmlNodes[i].ToHtml()))
        {
            htmlNodes.Remove(i);
        }
    }

    // Post times: every node in an entry that matches the time pattern.
    RegexFilter rf = new RegexFilter(@"\d\d:\d\d");
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        NodeList tmp = new Parser(new Lexer(htmlNodes[i].ToHtml())).Parse(rf);
        for (int j = 0; j < tmp.Count; j++)
        {
            string temp = tmp[j].ToHtml();
            ModifyRawText(ref temp);
            listTime.Add(temp);
        }
    }

    // Post URLs and titles: every anchor in the kept entries.
    NodeList atagNodes = new Parser(new Lexer(htmlNodes.ToHtml())).Parse(new TagNameFilter("a"));

    // Scheme + host of the list page, computed once. The old pattern needed "http://"
    // plus a following slash, so it produced nothing for https pages or for URLs
    // without a path, and it was re-evaluated for every anchor.
    string urlpart = new Regex(@"https?://[^/]+", RegexOptions.IgnoreCase).Match(url).Value;

    // A link is absolute only when it starts with a scheme; the old test treated any
    // href containing "http" anywhere (e.g. in a query string) as absolute.
    Regex absoluteUrl = new Regex(@"^\s*https?://", RegexOptions.IgnoreCase);

    for (int i = 0; i < atagNodes.Count; i++)
    {
        ATag link = (ATag)atagNodes.ElementAt(i);
        string temp1 = link.GetAttribute("href");
        string temp2 = link.StringText;
        if (temp1 != null && !absoluteUrl.IsMatch(temp1))
        {
            // Relative link: prefix it with the list page's scheme and host.
            temp1 = urlpart + temp1;
        }
        ModifyRawText(ref temp2);
        listUrl.Add(temp1);
        listTitle.Add(temp2);
    }
}
/// <summary>
/// Crawls the invitation list at <c>SiteUrl</c> page by page and builds one
/// <c>InviteInfo</c> per table row, using the row's detail page for the content.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as <c>MaxCount</c> items have been collected.
/// </param>
/// <returns>The collected items; empty when the first page cannot be downloaded.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    // Page count defaults to 1 when the pager cannot be read.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string htlPage = tdNodes.ToHtml();
        parser = new Parser(new Lexer(htlPage));
        NodeFilter filer = new TagNameFilter("a");
        NodeList pageList = parser.ExtractAllNodesThatMatch(filer);
        if (pageList != null && pageList.Count > 0)
        {
            // Search once; this used to be repeated on every iteration of the loop.
            NodeList pagerLinks = pageList.SearchFor(typeof(ATag), true);

            // Walk the pager links from the end; the first one whose target parses as a
            // number gives the page count.
            for (int i = pageList.Count - 1; i >= 0; i--)
            {
                try
                {
                    ATag aTag = pagerLinks[i] as ATag;
                    string pageTemp = aTag.Link.Replace("main.aspx?flg=3&id=6&page=", "");
                    pageInt = int.Parse(pageTemp);
                    break;
                }
                catch (Exception)
                {
                    // Not a numeric page link; try the previous one.
                }
            }
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch (Exception)
            {
                // A failed listing page must not discard what has been collected so far.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
            TableRow tr = table.Rows[j];
            beginDate = tr.Columns[2].ToPlainTextString().Trim();
            ATag aTag = tr.Columns[1].SearchFor(typeof(ATag), true)[0] as ATag;
            prjName = aTag.LinkText;
            InfoUrl = "http://www.uho.cn/" + aTag.Link;

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once and derive both variants from it
                // (it used to be requested twice).
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8);

                // Variant 1: original casing, used for the stored HTML.
                string markupSource = rawDetail.Replace(" ", "").Trim();
                Parser dtlparserHTML = new Parser(new Lexer(markupSource));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#FFFFFF"), new TagNameFilter("table")));
                HtmlTxt = dtnodeHTML.AsHtml();

                // Variant 2: lower-cased, line breaks converted, scripts and xml headers
                // removed; used for the plain-text content.
                htmldetail = rawDetail.ToLower().Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                continue;
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("bordercolor", "#ffffff"), new TagNameFilter("table")));
            inviteCtx = dtnode.AsString();
            specType = "其他";
            msgType = "深圳市友和保险经纪有限公司";
            inviteType = ToolHtml.GetInviteTypes(prjName);
            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the tender list at <c>SiteUrl</c> and its numbered index pages, and builds one
/// <c>InviteInfo</c> per list entry from the entry's detail page.
/// </summary>
/// <param name="crawlAll">
/// When <c>false</c>, crawling stops as soon as 20 items have been collected.
/// </param>
/// <returns>
/// The collected items; empty when the first page cannot be downloaded or has no pager.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    // Page count defaults to 1 when the pager text cannot be parsed.
    int pageInt = 1;
    string html = string.Empty;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.Default);
    }
    catch (Exception)
    {
        return list;
    }

    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "fenye123")));
    if (tdNodes == null || tdNodes.Count == 0)
    {
        // Nothing is crawled when the pager element is missing.
        return list;
    }

    string pageTemp = tdNodes.AsString().Replace(" ", "").Trim();
    try
    {
        pageInt = int.Parse(ToolHtml.GetRegexString(pageTemp, "共", "页"));
    }
    catch (Exception)
    {
        // Best effort: keep the default of one page.
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode("http://www.sz-otc.com/zhaobiao/index_" + i.ToString()) + ".html", Encoding.Default);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "zhaobiao_list")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        for (int j = 0; j < nodeList.Count; j++)
        {
            string htl = nodeList[j].ToHtml();
            Parser ul = new Parser(new Lexer(htl));
            NodeFilter filter = new TagNameFilter("li");
            NodeList liList = ul.ExtractAllNodesThatMatch(filter);
            if (liList == null || liList.Count == 0)
            {
                continue;
            }

            // Search once; this used to be repeated for every list item.
            NodeList entryLinks = liList.SearchFor(typeof(ATag), true);

            for (int k = 0; k < liList.Count; k++)
            {
                string code = string.Empty, buildUnit = string.Empty, prjName = string.Empty, prjAddress = string.Empty, inviteCtx = string.Empty, inviteType = string.Empty, specType = string.Empty, beginDate = string.Empty, endDate = string.Empty, remark = string.Empty, InfoUrl = string.Empty, msgType = string.Empty, otherType = string.Empty, HtmlTxt = string.Empty;
                ATag aTag = entryLinks[k] as ATag;
                InfoUrl = "http://www.sz-otc.com" + aTag.Link;
                prjName = aTag.LinkText.Replace("[新]", "").Replace(" ", "");
                if (prjName.Contains("]"))
                {
                    try
                    {
                        // Drop a leading "[...]" prefix from the project name.
                        int beg = prjName.IndexOf("]");
                        prjName = prjName.Substring(beg + 1, prjName.Length - beg - 1);
                    }
                    catch
                    {
                    }
                }

                string htmldetail = string.Empty;
                try
                {
                    htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.Default);
                }
                catch
                {
                    // Skip this entry. Returning null here (as before) threw away every
                    // item already collected because one detail page failed.
                    continue;
                }

                Parser dtlparser = new Parser(new Lexer(htmldetail));
                NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("class", "right_content"), new TagNameFilter("div")));
                if (dtnode == null || dtnode.Count == 0)
                {
                    continue;
                }

                HtmlTxt = dtnode.ToHtml();
                inviteCtx = dtnode.AsString().Replace(" ", "").Replace(" ", "");
                // Variant with line breaks inserted before "点击" and "发布人" so the
                // line-based patterns below can end on them.
                string invite = inviteCtx.Replace("点击", "\r\n").Replace("发布人", "\r\n");

                if (string.IsNullOrEmpty(prjName))
                {
                    Regex regexName = new Regex(@"(工程名称|项目名称)(:|:)[^\r\n]+\r\n");
                    prjName = regexName.Match(inviteCtx).Value.Replace("工程名称", "").Replace("项目名称", "").Replace(":", "").Replace(":", "").Trim();
                }

                Regex regex = new Regex(@"(工程编号|招标编号)(:|:)[^\r\n]+\r\n");
                code = regex.Match(invite).Value.Replace("工程编号", "").Replace("招标编号", "").Replace(":", "").Replace(":", "").Trim();

                Regex regexAddress = new Regex(@"(地址|项目地址)(:|:)[^\r\n]+\r\n");
                // Strip the longer label first: removing "地址" before "项目地址" left a
                // stray "项目" in front of the address.
                prjAddress = regexAddress.Match(inviteCtx).Value.Replace("项目地址", "").Replace("地址", "").Replace(":", "").Replace(":", "").Trim();

                Regex regexUnit = new Regex(@"(招标单位|招标机构)(:|:)[^\r\n]+\r\n");
                buildUnit = regexUnit.Match(inviteCtx).Value.Replace("招标单位", "").Replace("招标机构", "").Replace(":", "").Replace(":", "").Trim();

                Regex regexCar = new Regex(@"(开始日期|发布日期)(:|:)[^\r\n]+\r\n");
                beginDate = regexCar.Match(invite).Value.Replace("开始日期", "").Replace("发布日期", "").Replace(":", "").Replace(":", "").Trim();
                if (beginDate.Length > 10)
                {
                    // Keep only the first ten characters of the date text.
                    beginDate = beginDate.Substring(0, 10);
                }

                specType = "其他";
                msgType = "深圳市东方招标有限公司";
                inviteType = ToolHtml.GetInviteTypes(prjName);
                InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, otherType, InfoUrl, HtmlTxt);
                list.Add(info);
                if (!crawlAll && list.Count >= 20)
                {
                    return list;
                }
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged policy/regulation list at <c>SiteUrl</c> and builds one
/// <c>NotifyInfo</c> per list row from the row's detail page.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> items were collected;
/// when true, every page is crawled.
/// </param>
/// <returns>
/// The collected <c>NotifyInfo</c> items; an empty list when the first page
/// cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        // Without the first page there is nothing to crawl.
        return list;
    }

    // Get the page count: one <option> per page in the "__ec_pages" select.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            SelectTag selectTag = pageList[0] as SelectTag;
            pageInt = selectTag.OptionTags.Length;
        }
        catch
        {
            // Best effort: fall back to a single page when the pager cannot be read.
        }
    }

    // Built once instead of once per row: strips script/style/xml islands from detail pages.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Pages after the first are requested by posting the list form.
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a", "topicChrList_20070702_p", "topicChrList_20070702_s_name", "id", "method", "__ec_pages", "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_ldate" },
                new string[] { "topicChrList_20070702", "20", "", i.ToString(), "", "709", "view", i.ToString(), "20", "", "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasAttributeFilter("id", "topicChrList_20070702_table"), new TagNameFilter("table")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }

        TableTag table = dtList[0] as TableTag;
        // Rows 0-2 are skipped (presumably header/filter rows; confirm against the site markup).
        for (int j = 3; j < table.RowCount; j++)
        {
            TableRow tr = table.Rows[j];
            string headName = tr.Columns[1].ToPlainTextString().Trim();
            string releaseTime = tr.Columns[2].ToPlainTextString().Trim();
            string infoType = "政策法规";
            // This crawler never fills the source field; it is passed on empty.
            string infoScorce = string.Empty;

            // A row without a link cannot produce a detail URL, so skip it instead of crashing.
            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string infoUrl = "http://www.szzfcg.cn/portal/documentView.do?method=view&id=" + aTag.Link.Replace("/viewer.do?id=", "");

            string htmldeil;
            try
            {
                htmldeil = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(infoUrl), Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            htmldeil = regexHtml.Replace(htmldeil, "");
            parser = new Parser(new Lexer(htmldeil));
            NodeList noList = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = noList.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            string msgType = "深圳政府采购";

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            list.Add(info);

            // The cap applies to partial crawls only. The original tested "crawlAll &&",
            // which capped full crawls and left partial crawls unbounded.
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Parses an order-list HTML page into a table with one row per ordered article.
/// </summary>
/// <remarks>
/// The order links are the <c>A</c> elements inside the table at index 21 of the page.
/// For each link the store code is the third '='-separated part of its HREF, and the
/// order detail is read from a local file named <c>b_{orderNo}_{storeCode}.txt</c>.
/// Article rows are the <c>TR</c> elements (from index 2) of the table at index 23 of
/// that detail document. Pages that do not have this layout cause an indexing exception.
/// </remarks>
/// <param name="strBuff">HTML source of the order-list page.</param>
/// <returns>
/// A table with columns ArticleNo, StoreCode, StoreName, OrderQuantity, OrderAmount,
/// OrderNo, BarCode, TopStoreCode and TopStoreName.
/// </returns>
public DataTable OrderData(string strBuff)
{
    #region Parse the HTML nodes of the page
    DataTable dt = new DataTable();
    dt.Columns.Add("ArticleNo");
    dt.Columns.Add("StoreCode");
    dt.Columns.Add("StoreName");
    dt.Columns.Add("OrderQuantity");
    dt.Columns.Add("OrderAmount");
    dt.Columns.Add("OrderNo");
    dt.Columns.Add("BarCode");
    dt.Columns.Add("TopStoreCode");
    dt.Columns.Add("TopStoreName");

    Lexer lexerOrder = new Lexer(strBuff);
    Parser parserOrder = new Parser(lexerOrder);
    NodeFilter htmlOrder = new TagNameFilter("Table");
    NodeList htmlNodeOrders = parserOrder.Parse(htmlOrder);

    // The order links live in the table at index 21 of the page.
    Lexer lexerOrders = new Lexer(htmlNodeOrders[21].ToHtml());
    Parser parserOrders = new Parser(lexerOrders);
    NodeFilter htmlOrderss = new TagNameFilter("A");
    NodeList htmlNodeOrder = parserOrders.Parse(htmlOrderss);

    for (int i = 0; i < htmlNodeOrder.Count; i++)
    {
        // When the link markup contains '*', the order number is taken from the second child.
        string strOrderNumber = htmlNodeOrder[i].Children[0].ToHtml();
        if (htmlNodeOrder[i].ToHtml().Contains("*"))
        {
            strOrderNumber = htmlNodeOrder[i].Children[1].ToHtml();
        }

        string strStoreCode = string.Empty;
        ITag tag = htmlNodeOrder[i] as ITag;
        if (tag != null
            && !tag.IsEndTag()
            && tag.Attributes != null
            && tag.Attributes.Count > 0
            && tag.Attributes["HREF"] != null)
        {
            // The store code is the third '='-separated part of the HREF.
            strStoreCode = tag.Attributes["HREF"].ToString().Split('=')[2];
        }

        if (string.IsNullOrEmpty(strStoreCode))
        {
            continue;
        }

        string path2 = AppDomain.CurrentDomain.BaseDirectory + @"Areas\\BusinessData\Test" + $@"\b_{strOrderNumber.Trim('*')}_{strStoreCode}.txt";
        string strBuff2 = DataHelper.Read(path2, Encoding.Default);
        if (string.IsNullOrEmpty(strBuff2))
        {
            continue;
        }

        Lexer lexer = new Lexer(strBuff2);
        Parser parser = new Parser(lexer);
        NodeFilter html = new TagNameFilter("Table");
        NodeList htmlNodes = parser.Parse(html);

        // The article rows live in the table at index 23 of the detail document.
        Lexer lexers = new Lexer(htmlNodes[23].ToHtml());
        Parser parsers = new Parser(lexers);
        NodeFilter htmls = new TagNameFilter("TR");
        NodeList htmlNode = parsers.Parse(htmls);

        for (int j = 2; j < htmlNode.Count; j++)
        {
            string strArticleNo = htmlNode[j].Children[1].Children[0].ToHtml();
            // The store name is not available here; the column is filled with an empty string.
            string strStoreName = string.Empty;

            // Orders marked with '*' read the quantity from child 6 instead of child 5.
            string strOrderQuantity = strOrderNumber.Contains("*")
                ? htmlNode[j].Children[6].Children[0].ToHtml()
                : htmlNode[j].Children[5].Children[0].ToHtml();

            // Look up the barcode
            string barCode = DataHelper.GetBarCode(strArticleNo, _dtGoods);

            // Look up the customer code and the customer short name
            string strTopStoreCode = "";
            string strTopStoreName = "";
            DataHelper.GetTopStrore(strStoreCode, _dtTopEnterprise, ref strTopStoreCode, ref strTopStoreName);

            DataRow dr = dt.NewRow();
            dr["ArticleNo"] = strArticleNo;
            dr["StoreCode"] = strStoreCode;
            dr["StoreName"] = strStoreName;
            dr["OrderQuantity"] = strOrderQuantity;
            dr["OrderAmount"] = "0";
            dr["OrderNo"] = strOrderNumber.Trim('*');
            // These three values were looked up but never stored, leaving the columns empty.
            dr["BarCode"] = barCode;
            dr["TopStoreCode"] = strTopStoreCode;
            dr["TopStoreName"] = strTopStoreName;
            dt.Rows.Add(dr);
        }
    }
    #endregion

    return dt;
}
/// <summary>
/// Parses an inventory report page into a table with one row per report line.
/// </summary>
/// <remarks>
/// Report lines are the children of the second <c>Table</c> element of the page, from
/// child index 9 up to <c>Count - 4</c>, visiting every second child.
/// </remarks>
/// <param name="strBuff">HTML source of the inventory report.</param>
/// <returns>
/// A table with columns ArticleNo, StoreCode, StoreName, Inventory, Date, BarCode,
/// TopStoreCode and TopStoreName.
/// </returns>
public DataTable InventoryData(string strBuff)
{
    #region Parse the HTML nodes of the page
    DataTable dt = new DataTable();
    dt.Columns.Add("ArticleNo");    // article code
    dt.Columns.Add("StoreCode");    // region / warehouse
    dt.Columns.Add("StoreName");    // region / warehouse
    dt.Columns.Add("Inventory");    // total inventory
    dt.Columns.Add("Date");
    dt.Columns.Add("BarCode");
    dt.Columns.Add("TopStoreCode");
    dt.Columns.Add("TopStoreName");

    // The report consists of a single page, so it is parsed exactly once.
    NodeList tables = new Parser(new Lexer(strBuff)).Parse(new TagNameFilter("Table"));
    var reportTable = tables[1];

    string articleNo = "";
    int span = 1;       // value returned by GetStrArticleNo for the current article
    int seen = 1;       // lines consumed so far for the current article

    // Whitespace also counts as a child element, hence the step of two.
    for (int idx = 9; idx <= reportTable.Children.Count - 4; idx += 2)
    {
        var line = reportTable.Children[idx];

        if (seen == span)
        {
            span = 1;
        }

        string storeText;
        if (span > 1)
        {
            // Continuation line of a multi-row article: the store sits in child 1.
            storeText = line.Children[1].ToPlainTextString();
            if (seen <= span)
            {
                seen++;
            }
        }
        else
        {
            // First line of an article: derive the row span from the article cell markup.
            string articleCellHtml = line.Children[1].ToHtml().Replace(" ", " ").Replace("\"", "");
            span = GetStrArticleNo(articleCellHtml);
            articleNo = line.Children[1].ToPlainTextString();
            storeText = line.Children[9].ToPlainTextString();
            seen = 1;
        }

        // The same cell text is used for both the store code and the store name.
        string storeCode = storeText;
        string storeName = storeText;

        string inventory = line.Children[line.Children.Count - 2].ToPlainTextString();
        string today = DateTime.Now.ToString("yyyy-MM-dd");

        // Look up the barcode
        string barCode = DataHelper.GetBarCode(articleNo, _dtGoods);

        // Look up the customer code and the customer short name
        string topStoreCode = "";
        string topStoreName = "";
        DataHelper.GetTopStrore(storeCode, _dtTopEnterprise, ref topStoreCode, ref topStoreName);

        DataRow record = dt.NewRow();
        record["ArticleNo"] = articleNo;
        record["StoreCode"] = storeCode;
        record["StoreName"] = storeName;
        record["Inventory"] = inventory;
        record["Date"] = today;
        record["BarCode"] = barCode;
        record["TopStoreCode"] = topStoreCode;
        record["TopStoreName"] = topStoreName;
        dt.Rows.Add(record);
    }
    #endregion

    return dt;
}
/// <summary>
/// Crawls the paged project list at <c>SiteUrl</c> (grid <c>ctl00_Main_GV_New</c>) and
/// builds one <c>BaseProject</c> per row from the row's detail page.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> items were collected.
/// </param>
/// <returns>
/// The collected <c>BaseProject</c> items; an empty list when the first page cannot be
/// downloaded or its page counter cannot be parsed.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();
    string htl;
    string cookiestr = string.Empty;
    string viewState = string.Empty;
    string eventValidation = string.Empty;
    int page = 1;

    try
    {
        htl = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8, ref cookiestr);
    }
    catch (Exception)
    {
        return list;
    }

    // The page count is the text of the pager's page-count label.
    Parser parser = new Parser(new Lexer(htl));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("id", "ctl00_Main_paging_LblPageCount")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        try
        {
            page = int.Parse(tdNodes[0].ToPlainTextString().Trim());
        }
        catch
        {
            return list;
        }
    }

    // Field extractors for the detail text; built once instead of once per row.
    Regex regPrjAddr = new Regex(@"(工程地点|工程地址)(:|:)[^\r\n]+\r\n");
    Regex regpProspUnit = new Regex(@"勘查单位(:|:)[^\r\n]+\r\n");
    Regex regpDesignUnit = new Regex(@"设计单位(:|:)[^\r\n]+\r\n");
    Regex regpSuperUnit = new Regex(@"监理单位(:|:)[^\r\n]+\r\n");
    Regex regConst = new Regex(@"施工单位(:|:)[^\r\n]+\r\n");

    for (int i = 1; i <= page; i++)
    {
        if (i > 1)
        {
            // The ASP.NET state tokens are read only once, from the first page's HTML
            // (i == 2), and reused for all later pages.
            // NOTE(review): presumably the later async responses carry no usable tokens; confirm.
            if (i < 3)
            {
                viewState = this.ToolWebSite.GetAspNetViewState(htl);
                eventValidation = this.ToolWebSite.GetAspNetEventValidation(htl);
            }

            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "ctl00$ScriptManager1", "__EVENTTARGET", "__EVENTARGUMENT", "ctl00$Main$ddl_type", "ctl00$Main$txt_Title", "ctl00$Main$paging$txtPageIndex", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "__ASYNCPOST", "ctl00$Main$paging$btnForward.x", "ctl00$Main$paging$btnForward.y" },
                new string[] { "ctl00$UpdatePanel1|ctl00$Main$paging$btnForward", string.Empty, string.Empty, "1", string.Empty, i.ToString(), viewState, "", eventValidation, "true", "8", "9" });
            try
            {
                htl = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(htl));
        NodeList tableList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "ctl00_Main_GV_New")));
        if (tableList == null || tableList.Count == 0)
        {
            continue;
        }

        TableTag table = (TableTag)tableList[0];
        // Row 0 is skipped (presumably the grid header; confirm against the site markup).
        for (int j = 1; j < table.RowCount; j++)
        {
            // Fields this crawler never fills are passed on as empty strings.
            string pUrl = string.Empty, pInfoSource = string.Empty, pBeginDate = string.Empty, pBuilTime = string.Empty,
                   pEndDate = string.Empty, pConstUnit = string.Empty, pSuperUnit = string.Empty, pDesignUnit = string.Empty,
                   pProspUnit = string.Empty, pInviteArea = string.Empty, pBuildArea = string.Empty, pPrjClass = string.Empty,
                   pProClassLevel = string.Empty, pChargeDept = string.Empty, pPrjAddress = string.Empty, pBuildUnit = string.Empty,
                   pPrjCode = string.Empty, PrjName = string.Empty, pCreatetime = string.Empty;

            TableRow tr = table.Rows[j];
            PrjName = tr.Columns[2].ToPlainTextString().Trim();
            pBuildUnit = tr.Columns[3].ToPlainTextString().Trim();

            try
            {
                // The detail link sits in the row's ondblclick attribute; rewrite the opening
                // <tr ...> into an <a href=...> so the parser can read it as a link.
                string aLink = tr.ToHtml().Replace("ondblclick", "href").Replace("<tr", "<a");
                aLink = aLink.Remove(aLink.IndexOf("<td")) + "</a>";

                ATag aTag = new ATag();
                parser = new Parser(new Lexer(aLink));
                NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
                if (aList != null && aList.Count > 0)
                {
                    aTag = aList.SearchFor(typeof(ATag), true)[0] as ATag;
                }

                // Both markers are needed to build the URL. With the original "||" a link
                // carrying only one of them threw below and was skipped by the catch.
                if (aTag.Link.Contains("PrjManager") && aTag.Link.Contains("View"))
                {
                    pUrl = aTag.Link.Remove(aTag.Link.IndexOf("View")).Replace("&", "&") + "View";
                    int index = pUrl.IndexOf("PrjManager");
                    pUrl = "http://www.szbajs.gov.cn/SiteManage/" + pUrl.Substring(index, pUrl.Length - index);
                }
                else
                {
                    continue;
                }
            }
            catch (Exception)
            {
                // Rows whose link cannot be extracted are skipped.
                continue;
            }

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(pUrl), Encoding.UTF8);
            }
            catch (Exception)
            {
                continue;
            }

            Parser parserdetail = new Parser(new Lexer(htmldetail));
            NodeList dtnode = parserdetail.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "data_con")));
            // The null check must come first; the original dereferenced Count before testing for null.
            if (dtnode == null || dtnode.Count == 0)
            {
                continue;
            }

            pInfoSource = dtnode.AsString().Replace(" ", "");
            pPrjAddress = regPrjAddr.Match(pInfoSource).Value.Replace("工程地址", "").Replace("工程地点", "").Replace(":", "").Replace(":", "").Trim();
            pProspUnit = regpProspUnit.Match(pInfoSource).Value.Replace("勘查单位", "").Replace(":", "").Replace(":", "").Trim();
            pDesignUnit = regpDesignUnit.Match(pInfoSource).Value.Replace("设计单位", "").Replace(":", "").Replace(":", "").Trim();
            pSuperUnit = regpSuperUnit.Match(pInfoSource).Value.Replace("监理单位", "").Replace(":", "").Replace(":", "").Trim();
            pConstUnit = regConst.Match(pInfoSource).Value.Replace("施工单位", "").Replace(":", "").Replace(":", "").Trim();
            if (string.IsNullOrEmpty(pChargeDept))
            {
                pChargeDept = "宝安区建设局";
            }

            BaseProject info = ToolDb.GenBaseProject("广东省", pUrl, "深圳市宝安区", pInfoSource, pBuilTime, pBeginDate, pEndDate, pConstUnit, pSuperUnit, pDesignUnit, pProspUnit, pInviteArea, pBuildArea, pPrjClass, pProClassLevel, pChargeDept, pPrjAddress, pBuildUnit, pPrjCode, PrjName, pCreatetime, "深圳市宝安区建设局");
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged invitation list at <c>SiteUrl</c>, saves one <c>InviteInfo</c> per row
/// through <c>ToolDb.SaveEntity</c>, and saves the attachments linked from each newly
/// saved detail page.
/// </summary>
/// <remarks>
/// Entities are persisted directly and are never added to the returned list. After every
/// ten successful saves the thread sleeps for 300 seconds.
/// </remarks>
/// <param name="crawlAll">
/// When false, crawling stops once <c>MaxCount</c> save attempts were made.
/// </param>
/// <returns>
/// An empty list, or <c>null</c> when the <c>MaxCount</c> limit stopped the crawl
/// (this distinction is kept exactly as callers see it today).
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new List<InviteInfo>();
    int sqlCount = 0;
    int count = 0;

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch
    {
        return list;
    }

    // Get the page count: one <option> per page in the "__ec_pages" select.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList pageNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("select"), new HasAttributeFilter("name", "__ec_pages")));
    if (pageNode != null && pageNode.Count > 0)
    {
        SelectTag selectTag = pageNode[0] as SelectTag;
        if (selectTag != null)
        {
            pageInt = selectTag.OptionTags.Length;
        }
    }

    // Built once instead of once per row.
    Regex regexLink = new Regex(@"id=[^-]+");
    Regex regCtx = new Regex(@"[\r\n]+");
    Regex regcode = new Regex(@"(招标编号|项目编号)(:|:)([0-9]|[A-Za-z]|[-])+");
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>|<style[^<]*</style>|<xml[^<]*</xml>");

    string cookiestr = string.Empty;
    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "ec_i", "topicChrList_20070702_crd", "topicChrList_20070702_f_a", "topicChrList_20070702_p", "topicChrList_20070702_s_name", "id", "method", "__ec_pages", "topicChrList_20070702_rd", "topicChrList_20070702_f_name", "topicChrList_20070702_f_ldate" },
                    new string[] { "topicChrList_20070702", "20", string.Empty, i.ToString(), string.Empty, "1660", "view", (i - 1).ToString(), "20", string.Empty, string.Empty });
                html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, nvc, Encoding.UTF8, ref cookiestr);
            }
            catch
            {
                // Skip this page. Swallowing the error would re-process the previous page's HTML.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "topicChrList_20070702_table")));
        if (tdNodes == null || tdNodes.Count == 0)
        {
            continue;
        }

        TableTag table = tdNodes[0] as TableTag;
        // Rows 0-2 are skipped (presumably header/filter rows; confirm against the site markup).
        for (int t = 3; t < table.RowCount; t++)
        {
            // Fields this crawler never fills are passed on as empty strings.
            string buildUnit = string.Empty, endDate = string.Empty, remark = string.Empty, HtmlTxt = string.Empty;

            TableRow tr = table.Rows[t];
            string prjName = tr.Columns[2].ToPlainTextString().Trim().ToRegString();
            string inviteType = tr.Columns[3].ToPlainTextString().Trim();
            string beginDate = tr.Columns[4].ToPlainTextString().Trim();

            // A row without a link cannot produce a detail URL, so skip it instead of crashing.
            NodeList anchors = tr.Columns[2].SearchFor(typeof(ATag), true);
            ATag aTag = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string InfoUrl = "http://www.szzfcg.cn/portal/documentView.do?method=view&" + regexLink.Match(aTag.Link).Value;

            string htmldetail = string.Empty;
            try
            {
                // Download the detail page once and derive both variants from it;
                // the original issued the same request twice.
                string rawDetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).Replace(" ", "");
                htmldetail = rawDetail.Replace("</br>", "\r\n").Replace("<br>", "\r\n");

                Parser dtlparserHTML = new Parser(new Lexer(rawDetail.Trim()));
                NodeList dtnodeHTML = dtlparserHTML.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                HtmlTxt = dtnodeHTML.AsHtml();
            }
            catch (Exception)
            {
                // Best effort: continue with whatever was obtained; the fallbacks below
                // fill in the content fields where possible.
            }

            Parser dtlparser = new Parser(new Lexer(htmldetail));
            NodeList dtnode = dtlparser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
            string inviteCtx = dtnode.AsString().Replace(" ", "").Replace("\t", "").Trim("\r\n".ToCharArray()).Replace("“", "“").Replace("”", "”").Replace("双击鼠标自动滚屏[打印此页][关闭此页]", "");
            inviteCtx = System.Web.HttpUtility.HtmlDecode(inviteCtx);
            inviteCtx = regCtx.Replace(inviteCtx, "\r\n");
            string code = regcode.Match(inviteCtx).Value.Replace("招标编号", "").Replace("项目编号", "").Replace(":", "").Replace(":", "").Trim();

            // Fallback 1: take text and HTML straight from the <body> of the cleaned page.
            if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(HtmlTxt))
            {
                parser = new Parser(new Lexer(htmldetail));
                NodeList ctxList = parser.ExtractAllNodesThatMatch(new TagNameFilter("body"));
                inviteCtx = ctxList.AsString();
                HtmlTxt = ctxList.AsHtml();
            }

            // Fallback 2: strip script/style/xml islands and tags with regular expressions.
            if (string.IsNullOrEmpty(inviteCtx) || string.IsNullOrEmpty(HtmlTxt))
            {
                HtmlTxt = regexHtml.Replace(htmldetail, "");
                inviteCtx = Regex.Replace(HtmlTxt, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            }

            string msgType = "深圳政府采购";
            string specType = "政府采购";
            string prjAddress = "深圳市";
            if (inviteType.Contains("160"))
            {
                inviteType = ToolHtml.GetInviteTypes(prjName);
            }

            InviteInfo info = ToolDb.GenInviteInfo("广东省", "深圳政府采购", "", string.Empty, code, prjName, prjAddress, buildUnit, beginDate, endDate, inviteCtx, remark, msgType, inviteType, specType, string.Empty, InfoUrl, HtmlTxt);
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields, this.ExistsUpdate, this.ExistsHtlCtx))
            {
                continue;
            }
            count++;

            // Save every attachment linked from the detail page.
            parser = new Parser(new Lexer(htmldetail));
            NodeList fileNode = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (fileNode != null && fileNode.Count > 0)
            {
                for (int f = 0; f < fileNode.Count; f++)
                {
                    ATag tag = fileNode[f] as ATag;
                    if (tag == null || !tag.IsAtagAttach())
                    {
                        continue;
                    }

                    try
                    {
                        // Links that already carry a host are used as-is; others are site-relative.
                        string link = tag.Link.Replace("&", "&");
                        bool hasHost = tag.Link.ToLower().Contains(".com") || tag.Link.ToLower().Contains(".cn");
                        string attachUrl = hasHost ? link : "http://www.szzfcg.cn" + link;

                        BaseAttach attach = ToolHtml.GetBaseAttachByUrl(attachUrl, tag.LinkText, info.Id, "SiteManage\\Files\\InviteAttach\\");
                        if (attach != null)
                        {
                            ToolDb.SaveEntity(attach, "SourceID,AttachServerPath");
                        }
                    }
                    catch
                    {
                        // A failing attachment must not abort the crawl.
                    }
                }
            }

            // Pause for 300 seconds after every ten saved entities.
            if (count >= 10)
            {
                count = 0;
                Thread.Sleep(1000 * 300);
            }
        }
    }

    return list;
}
/// <summary>
/// Parses a sales report page into a table with one row per report line.
/// </summary>
/// <remarks>
/// Report lines are the children of the second <c>Table</c> element of the page, from
/// child index 9 up to <c>Count - 4</c>, visiting every second child. The sales figure is
/// the integer in child 13 minus the integer in child 15.
/// </remarks>
/// <param name="strBuff">HTML source of the sales report.</param>
/// <returns>
/// A table with columns ArticleNo, BarCode, StoreCode, StoreName, Sales, Date,
/// TopStoreCode and TopStoreName.
/// </returns>
public DataTable SalesData(string strBuff)
{
    #region Parse the HTML nodes of the page
    DataTable dt = new DataTable();
    dt.Columns.Add("ArticleNo");    // article code
    dt.Columns.Add("BarCode");      // article barcode
    dt.Columns.Add("StoreCode");    // region / warehouse
    dt.Columns.Add("StoreName");    // region / warehouse
    dt.Columns.Add("Sales");        // sales
    dt.Columns.Add("Date");
    dt.Columns.Add("TopStoreCode");
    dt.Columns.Add("TopStoreName");

    // The report consists of a single page, so it is parsed exactly once.
    NodeList tables = new Parser(new Lexer(strBuff)).Parse(new TagNameFilter("Table"));
    var reportTable = tables[1];

    // Whitespace also counts as a child element, hence the step of two.
    for (int idx = 9; idx <= reportTable.Children.Count - 4; idx += 2)
    {
        var line = reportTable.Children[idx];

        string articleNo = line.Children[1].ToPlainTextString();
        string barCode = line.Children[5].ToPlainTextString();
        // The same cell text is used for both the store code and the store name.
        string storeText = line.Children[11].ToPlainTextString();
        string storeCode = storeText;
        string storeName = storeText;

        string sales = "";
        if (line.Children != null)
        {
            int sold = int.Parse(line.Children[13].ToPlainTextString());
            int returned = int.Parse(line.Children[15].ToPlainTextString());
            sales = (sold - returned).ToString();
        }

        string today = DateTime.Now.ToString("yyyy-MM-dd");

        // Look up the article number when the report line does not carry one
        if (string.IsNullOrEmpty(articleNo))
        {
            articleNo = DataHelper.GetArticalNo(barCode, _dtGoods);
        }

        // Look up the customer code and the customer short name
        string topStoreCode = "";
        string topStoreName = "";
        DataHelper.GetTopStrore(storeCode, _dtTopEnterprise, ref topStoreCode, ref topStoreName);

        DataRow record = dt.NewRow();
        record["ArticleNo"] = articleNo;
        record["BarCode"] = barCode;
        record["StoreCode"] = storeCode;
        record["StoreName"] = storeName;
        record["Sales"] = sales;
        record["Date"] = today;
        record["TopStoreCode"] = topStoreCode;
        record["TopStoreName"] = topStoreName;
        dt.Rows.Add(record);
    }
    #endregion

    return dt;
}
/// <summary>
/// Crawls the paged bid-result list at <c>SiteUrl</c> and builds one <c>BidInfo</c> per
/// list row from the row's detail page.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops as soon as <c>MaxCount</c> items were collected.
/// </param>
/// <returns>
/// The collected <c>BidInfo</c> items; an empty list when the first page cannot be downloaded.
/// </returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    IList list = new ArrayList();

    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), Encoding.UTF8);
    }
    catch (Exception)
    {
        return list;
    }

    // Get the page count: the page number carried by the last pager link that parses as a number.
    int pageInt = 1;
    Parser parser = new Parser(new Lexer(html));
    NodeList tdNodes = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "AspNetPager1")));
    if (tdNodes != null && tdNodes.Count > 0)
    {
        string htlPage = tdNodes.ToHtml();
        parser = new Parser(new Lexer(htlPage));
        NodeList pageList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
        if (pageList != null && pageList.Count > 0)
        {
            for (int i = pageList.Count - 1; i >= 0; i--)
            {
                try
                {
                    ATag aTag = pageList.SearchFor(typeof(ATag), true)[i] as ATag;
                    string pageTemp = aTag.Link.Replace("main.aspx?flg=10&id=6&page=", "");
                    pageInt = int.Parse(pageTemp);
                    break;
                }
                catch (Exception)
                {
                    // Not a numeric page link; try the previous one.
                }
            }
        }
    }

    // Built once instead of once per row.
    Regex regexHtml = new Regex(@"<script[^<]*</script>|<\?xml[^/]*/>");

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl + "&page=" + i.ToString()), Encoding.UTF8);
            }
            catch
            {
                // Skip this page. Swallowing the error would re-process the previous
                // page's HTML and add its rows a second time.
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList nodeList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "760")));
        if (nodeList == null || nodeList.Count == 0)
        {
            continue;
        }

        TableTag table = nodeList[0] as TableTag;
        for (int j = 0; j < table.RowCount; j++)
        {
            // Fields this crawler never fills are passed on as empty strings.
            string buildUnit = string.Empty, bidUnit = string.Empty, bidMoney = string.Empty,
                   endDate = string.Empty, otherType = string.Empty, prjMgr = string.Empty;

            TableRow tr = table.Rows[j];
            string beginDate = tr.Columns[2].ToPlainTextString().Trim();

            // A row without a link cannot produce a detail URL, so skip it instead of crashing.
            NodeList anchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag rowLink = (anchors != null && anchors.Count > 0) ? anchors[0] as ATag : null;
            if (rowLink == null)
            {
                continue;
            }
            string prjName = rowLink.LinkText;
            string InfoUrl = "http://www.uho.cn/" + rowLink.Link;

            string htmldetail;
            try
            {
                htmldetail = this.ToolWebSite.GetHtmlByUrl(InfoUrl, Encoding.UTF8).ToLower().Replace(" ", "").Replace("</br>", "\r\n").Replace("<br>", "\r\n").Replace("<br/>", "\r\n");
                htmldetail = regexHtml.Replace(htmldetail, "");
            }
            catch (Exception)
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldetail));
            NodeList deaiList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "sdg")));
            if (deaiList == null || deaiList.Count == 0)
            {
                continue;
            }

            string HtmlTxt = deaiList.AsHtml();
            string bidCtx = HtmlTxt.ToCtxString();
            string code = bidCtx.GetRegexBegEnd("编号:", ")", 50);
            if (!string.IsNullOrEmpty(code))
            {
                code = code.ToUpper();
            }

            // First try the result table: row 0 holds the labels and row 1 the values,
            // joined into "label:value" lines for the regex helpers.
            parser = new Parser(new Lexer(HtmlTxt));
            NodeList bidNode = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("width", "100%")));
            if (bidNode != null && bidNode.Count > 0)
            {
                TableTag bidTable = bidNode[0] as TableTag;
                try
                {
                    string ctx = string.Empty;
                    for (int d = 0; d < bidTable.Rows[0].ColumnCount; d++)
                    {
                        ctx += bidTable.Rows[0].Columns[d].ToNodePlainString() + ":";
                        ctx += bidTable.Rows[1].Columns[d].ToNodePlainString() + "\r\n";
                    }
                    bidUnit = ctx.GetBidRegex();
                    bidMoney = ctx.GetMoneyRegex();
                }
                catch
                {
                    // Unexpected table shape; the full-text fallbacks below apply.
                }
            }

            if (string.IsNullOrEmpty(bidUnit))
            {
                bidUnit = bidCtx.GetBidRegex();
            }
            // Fall back to the full text also when the table was missing or unreadable and
            // the amount is still empty; the original only tested for "0".
            if (string.IsNullOrEmpty(bidMoney) || bidMoney == "0")
            {
                bidMoney = bidCtx.GetMoneyRegex();
            }

            string specType = "其他";
            string msgType = "深圳市友和保险经纪有限公司";
            prjName = ToolDb.GetPrjName(prjName);
            prjName = prjName.Replace(" ", "").Trim();
            string bidType = ToolHtml.GetInviteTypes(prjName);

            BidInfo info = ToolDb.GenBidInfo("广东省", "深圳社会招标", "", string.Empty, code, prjName, buildUnit, beginDate, bidUnit, beginDate, endDate, bidCtx, string.Empty, msgType, bidType, specType, otherType, bidMoney, InfoUrl, prjMgr, HtmlTxt);
            list.Add(info);
            if (!crawlAll && list.Count >= this.MaxCount)
            {
                return list;
            }
        }
    }

    return list;
}
/// <summary>
/// Crawls the paged notice grid (<c>GridView1</c>) at <c>SiteUrl</c>, saves one
/// <c>NotifyInfo</c> per row through <c>ToolDb.SaveEntity</c>, and saves the files linked
/// from each newly saved notice.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops once <c>MaxCount</c> save attempts were made.
/// </param>
/// <returns>Always <c>null</c>; the entities are persisted directly.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int saveAttempts = 0;
    string pageHtml = string.Empty;
    try
    {
        pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Get the page count: column count of the first row of the pager table nested in the grid, plus one.
    int totalPages = 1;
    Parser parser = new Parser(new Lexer(pageHtml));
    NodeList pagerTables = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")), true), new TagNameFilter("table")));
    if (pagerTables != null && pagerTables.Count > 0)
    {
        try
        {
            TableTag pager = pagerTables[0] as TableTag;
            totalPages = pager.Rows[0].ColumnCount + 1;
        }
        catch
        {
            totalPages = 1;
        }
    }

    for (int pageNo = 1; pageNo <= totalPages; pageNo++)
    {
        if (pageNo > 1)
        {
            // Later pages are requested as a GridView postback using the current page's state tokens.
            try
            {
                string viewState = this.ToolWebSite.GetAspNetViewState(pageHtml);
                string eventValidation = this.ToolWebSite.GetAspNetEventValidation(pageHtml);
                NameValueCollection form = this.ToolWebSite.GetNameValueCollection(
                    new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "sel", "beginDate", "endDate", "infotitle" },
                    new string[] { "GridView1", "Page$" + pageNo.ToString(), viewState, "", eventValidation, "1", "", "", "" });
                pageHtml = this.ToolWebSite.GetHtmlByUrl(SiteUrl, form, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(pageHtml));
        NodeList grids = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (grids == null || grids.Count == 0)
        {
            continue;
        }

        TableTag grid = grids[0] as TableTag;
        // The first and the last row of the grid are not notice rows and are skipped.
        for (int rowNo = 1; rowNo < grid.RowCount - 1; rowNo++)
        {
            TableRow row = grid.Rows[rowNo];

            string headName = row.Columns[1].ToNodePlainString();
            string releaseTime = row.Columns[3].ToPlainTextString().GetDateRegex();
            if (string.IsNullOrEmpty(releaseTime))
            {
                // Second attempt with an explicit slash-separated date format.
                releaseTime = row.Columns[3].ToPlainTextString().GetDateRegex("yyyy/MM/dd");
            }
            string infoScorce = row.Columns[2].ToNodePlainString();
            string infoType = "通知公告";
            string infoUrl = "http://www.szjsjy.com.cn/Notify/" + row.Columns[1].GetATagHref();

            string detailHtml;
            try
            {
                detailHtml = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8).GetJsString();
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(detailHtml));
            NodeList contentTables = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("background", "../img/A-3_17.gif")));
            if (contentTables == null || contentTables.Count == 0)
            {
                continue;
            }

            string ctxHtml = contentTables.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = contentTables.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n").Replace("\r\n\r\n", "\r\n");
            string msgType = MsgTypeCosnt.ShenZhenMsgType;
            infoScorce = infoScorce.Replace(" ", "");

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            if (!crawlAll && saveAttempts >= this.MaxCount)
            {
                return null;
            }
            saveAttempts++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Save every file linked from the notice body, except the "返回" (back) link.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList links = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (links == null || links.Count == 0)
            {
                continue;
            }

            for (int linkNo = 0; linkNo < links.Count; linkNo++)
            {
                ATag link = links[linkNo].GetATag();
                if (link == null || link.LinkText.Contains("返回"))
                {
                    continue;
                }

                try
                {
                    BaseAttach attachment = ToolHtml.GetBaseAttach("http://www.szjsjy.com.cn/" + link.Link.Replace("../", ""), link.LinkText, info.Id);
                    if (attachment != null)
                    {
                        ToolDb.SaveEntity(attachment, string.Empty);
                    }
                }
                catch
                {
                    // A failing attachment must not abort the crawl.
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Crawls the paged notice grid (<c>GridView1</c>) at <c>SiteUrl</c>, saves one
/// <c>NotifyInfo</c> per row through <c>ToolDb.SaveEntity</c>, and hands the files linked
/// from each newly saved notice to <c>AddBaseFile</c>.
/// </summary>
/// <param name="crawlAll">
/// When false, crawling stops once <c>MaxCount</c> save attempts were made.
/// </param>
/// <returns>Always <c>null</c>; the entities are persisted directly.</returns>
protected override IList ExecuteCrawl(bool crawlAll)
{
    int pageInt = 1, sqlCount = 0;
    string html;
    try
    {
        html = this.ToolWebSite.GetHtmlByUrl(SiteUrl, Encoding.UTF8);
    }
    catch (Exception)
    {
        return null;
    }

    // Get the page count: the text of the pager link at index (column count - 2) in the
    // first row of the pager table nested in the grid.
    Parser parser = new Parser(new Lexer(html));
    NodeList pageList = parser.ExtractAllNodesThatMatch(new AndFilter(new HasParentFilter(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")), true), new TagNameFilter("table")));
    if (pageList != null && pageList.Count > 0)
    {
        try
        {
            TableTag table = pageList[0] as TableTag;
            int pageAtag = table.Rows[0].ColumnCount;
            pageInt = int.Parse((table.Rows[0].SearchFor(typeof(ATag), true)[pageAtag - 2] as ATag).LinkText);
        }
        catch
        {
            pageInt = 1;
        }
    }

    for (int i = 1; i <= pageInt; i++)
    {
        if (i > 1)
        {
            // Later pages are requested as a GridView postback using the current page's state tokens.
            string viewState = this.ToolWebSite.GetAspNetViewState(html);
            string eventValidation = this.ToolWebSite.GetAspNetEventValidation(html);
            NameValueCollection nvc = this.ToolWebSite.GetNameValueCollection(
                new string[] { "__EVENTTARGET", "__EVENTARGUMENT", "__VIEWSTATE", "__VIEWSTATEENCRYPTED", "__EVENTVALIDATION", "sel", "beginDate", "endDate", "infotitle" },
                new string[] { "GridView1", "Page$" + i.ToString(), viewState, "", eventValidation, "1", "", "", "" });
            try
            {
                html = this.ToolWebSite.GetHtmlByUrl(this.ToolWebSite.UrlEncode(SiteUrl), nvc, Encoding.UTF8);
            }
            catch
            {
                continue;
            }
        }

        parser = new Parser(new Lexer(html));
        NodeList dtList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("id", "GridView1")));
        if (dtList == null || dtList.Count == 0)
        {
            continue;
        }

        TableTag grid = dtList[0] as TableTag;
        // The first and the last row of the grid are not notice rows and are skipped.
        for (int j = 1; j < grid.RowCount - 1; j++)
        {
            TableRow tr = grid.Rows[j];
            string headName = tr.Columns[1].ToPlainTextString().Trim();
            string infoScorce = tr.Columns[2].ToPlainTextString().Trim();
            string releaseTime = tr.Columns[3].ToPlainTextString().Trim();
            string infoType = "通知公告";

            // A row without a link cannot produce a detail URL, so skip it instead of crashing.
            NodeList rowAnchors = tr.Columns[1].SearchFor(typeof(ATag), true);
            ATag aTag = (rowAnchors != null && rowAnchors.Count > 0) ? rowAnchors[0] as ATag : null;
            if (aTag == null)
            {
                continue;
            }
            string infoUrl = "http://www.szjsjy.com.cn/Notify/" + aTag.Link;

            string htmldetailtxt;
            try
            {
                htmldetailtxt = this.ToolWebSite.GetHtmlByUrl(infoUrl, Encoding.UTF8);
            }
            catch
            {
                continue;
            }

            parser = new Parser(new Lexer(htmldetailtxt));
            NodeList noList = parser.ExtractAllNodesThatMatch(new AndFilter(new TagNameFilter("table"), new HasAttributeFilter("background", "../img/A-3_17.gif")));
            if (noList == null || noList.Count == 0)
            {
                continue;
            }

            string ctxHtml = noList.AsHtml().Replace("<br/>", "\r\n").Replace("<BR/>", "");
            string infoCtx = noList.AsString().Replace(" ", "").Replace(" ", "").Replace("\t\t", "\t").Replace("\t\t", "\t");
            infoCtx = Regex.Replace(infoCtx, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase).Replace(" ", "").Replace("\t", "");
            string msgType = "深圳市建设工程交易中心";
            infoScorce = infoScorce.Replace(" ", "");

            NotifyInfo info = ToolDb.GenNotifyInfo(headName, releaseTime, infoScorce, msgType, infoUrl, ctxHtml, "广东省", "深圳市工程", string.Empty, infoCtx, infoType);
            if (!crawlAll && sqlCount >= this.MaxCount)
            {
                return null;
            }
            sqlCount++;

            if (!ToolDb.SaveEntity(info, this.ExistCompareFields))
            {
                continue;
            }

            // Hand every file linked from the notice body to AddBaseFile, except the "返回" (back) link.
            parser = new Parser(new Lexer(ctxHtml));
            NodeList aList = parser.ExtractAllNodesThatMatch(new TagNameFilter("a"));
            if (aList == null || aList.Count == 0)
            {
                continue;
            }

            // Run the recursive search once. The original repeated it on every iteration
            // and indexed the result with a counter bounded by a different list.
            NodeList anchors = aList.SearchFor(typeof(ATag), true);
            for (int k = 0; k < aList.Count && k < anchors.Count; k++)
            {
                ATag a = anchors[k] as ATag;
                if (a != null && !a.LinkText.Contains("返回"))
                {
                    AddBaseFile("http://www.szjsjy.com.cn/" + a.Link.Replace("../", ""), a.LinkText, info);
                }
            }
        }
    }

    return null;
}