public static List<Product> LoadGoods(string html)
{
    // Parse the page and build a Product for every node carrying class="product".
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter filter = new HasAttributeFilter("class", "product");
    NodeList products = parser.ExtractAllNodesThatMatch(filter);
    List<Product> result = new List<Product>();
    for (int i = 0; i < products.Count; i++)
    {
        try
        {
            ITag product = products[i] as ITag;
            if (product == null)
            {
                continue; // defensively skip non-tag matches
            }

            Product p = new Product();

            // name
            NodeFilter nameFilter = new HasAttributeFilter("class", "product-title");
            NodeList names = product.Children.ExtractAllNodesThatMatch(nameFilter, true);
            ITag name = names[0] as ITag;
            p.name = name.ToPlainTextString().Trim();

            // price (the original comment mislabeled this section as "name");
            // Substring(7) strips a fixed-length prefix — presumably a currency
            // label; TODO confirm against the live markup
            NodeFilter priceFilter = new HasAttributeFilter("class", "product-price");
            NodeList prices = product.Children.ExtractAllNodesThatMatch(priceFilter, true);
            ITag price = prices[0] as ITag;
            p.price = Decimal.Parse(price.ToPlainTextString().Trim().Substring(7));

            // image: the real source sits in the DATA-KS-LAZYLOAD attribute
            NodeFilter imgFilter = new TagNameFilter("img");
            NodeList imgs = product.Children.ExtractAllNodesThatMatch(imgFilter, true);
            ITag img = imgs[0] as ITag;
            p.img = img.GetAttribute("DATA-KS-LAZYLOAD");

            // promo is optional; stays empty when absent
            p.promo = "";
            NodeFilter promoFilter = new HasAttributeFilter("class", "promo");
            NodeList promos = product.Children.ExtractAllNodesThatMatch(promoFilter, true);
            if (promos.Count > 0)
            {
                ITag promo = promos[0] as ITag;
                p.promo = promo.GetAttribute("data-promo");
            }

            result.Add(p);
        }
        catch (Exception ex)
        {
            // Best-effort scraping: a malformed product entry is still skipped,
            // but no longer silently — the original empty catch hid all failures.
            Console.WriteLine("LoadGoods: skipping malformed product node: " + ex.Message);
        }
    }
    return result;
}
public NodeList GetListUrl(string url)
{
    // Fetch the page behind <url> and return every node whose class is "list_title".
    Parser parser = ParserHelp.GetParser(url);
    NodeFilter titleFilter = new HasAttributeFilter("class", "list_title");
    return parser.ExtractAllNodesThatMatch(titleFilter);
}
static void GetStoryOfRevolution()
{
    // Read the local catalogue page listing all chapter links.
    string catalogueHtml;
    using (StreamReader reader = new StreamReader("catalogue.htm"))
    {
        catalogueHtml = reader.ReadToEnd();
    }
    Lexer lexer = new Lexer(catalogueHtml);
    Parser parser = new Parser(lexer);

    // Chapter links are class="fontStyle2 colorStyleLink" anchors anywhere under
    // the "row zhangjieUl" container.
    HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
    HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
    AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
    NodeList linkNodeList = parser.Parse(linkFilter);

    // Download every chapter page.
    List<string> linkUrlList = new List<string>(linkNodeList.Size());
    List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
    for (int i = 0; i < linkNodeList.Size(); i++)
    {
        ATag linkNode = (ATag)linkNodeList[i];
        linkUrlList.Add(linkNode.Link);
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkUrlList[linkUrlList.Count - 1]);
        // Dispose each response reader. The original closed only the final one
        // (leaking the rest) and threw NullReferenceException when no links
        // were found, because chapterReader stayed null.
        using (StreamReader chapterReader = new StreamReader(new BufferedStream(httpWebRequest.GetResponse().GetResponseStream(), 4 * 200 * 1024)))
        {
            chapterHtmlContentList.Add(chapterReader.ReadToEnd());
        }
        Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
    }

    // Extract each chapter's paragraphs and write the whole book to disk.
    HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
    using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
    {
        for (int i = 0; i < chapterHtmlContentList.Count; i++)
        {
            writer.WriteLine("第" + (i + 1) + "章");
            lexer = new Lexer(chapterHtmlContentList[i]);
            parser = new Parser(lexer);
            NodeList praghNodeList = parser.Parse(praghFilter);
            if (praghNodeList.Size() == 1)
            {
                for (int j = 0; j < praghNodeList[0].Children.Size(); j++)
                {
                    if (praghNodeList[0].Children[j].GetType().Equals(typeof(ParagraphTag)))
                    {
                        ParagraphTag praghTag = (ParagraphTag)praghNodeList[0].Children[j];
                        writer.WriteLine(" " + praghTag.StringText);
                    }
                }
                writer.WriteLine();
            }
            else
            {
                Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
            }
        }
    }
}
public List<PlayTime> getPlayTimes(string xmlFile)
{
    // Parse a saved cinema page into PlayTime records.
    // The 4-digit run in the file name is taken as the cinema ID — assumes the
    // path always embeds it; TODO confirm against the caller.
    Match match = Regex.Match(xmlFile, @"\d\d\d\d");
    string cinemaID = match.Value; // cinema ID
    List<PlayTime> playTimes = new List<PlayTime>();
    string html = File.ReadAllText(xmlFile);
    Lexer lexer = new Lexer(html);
    Parser playParser = new Parser(lexer);
    NodeFilter playFilter = new HasAttributeFilter("CLASS", "px14");
    NodeList playTimeList = playParser.ExtractAllNodesThatMatch(playFilter);
    if (playTimeList.Count >= 1)
    {
        for (int i = 0; i < playTimeList.Count; i++)
        {
            PlayTime playTime = new PlayTime();
            ITag playTag = (playTimeList[i] as ITag);
            ITag idTag = (playTag.FirstChild as ITag);
            if (idTag.Attributes != null)
            {
                // Movie ID: a 6-digit (or, failing that, 5-digit) number in the
                // first child's HREF.
                string strID = idTag.Attributes["HREF"].ToString();
                Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
                if (idMatch.Success)
                {
                    playTime.MovieID = int.Parse(idMatch.Value);
                }
                else
                {
                    Match strMatch = Regex.Match(strID, @"\d\d\d\d\d");
                    if (strMatch.Success)
                    {
                        playTime.MovieID = int.Parse(strMatch.Value);
                    }
                }
            }
            // Date text sits two siblings over (NOTE(review): presumably the
            // intermediate sibling is a whitespace text node — verify markup);
            // strip the surrounding "上映" characters before parsing the date.
            string strTime = playTag.NextSibling.NextSibling.ToPlainTextString();
            char[] a = {'上','映'};
            strTime = strTime.Trim(a);
            playTime.Playtime = DateTime.Parse(strTime);
            playTime.CinemaID = int.Parse(cinemaID);
            playTime.PlayState = true;
            playTimes.Add(playTime);
        }
        return playTimes;
    }
    // No matching nodes — callers must handle null.
    return null;
}
public Job GetJobInfoParser(string url)
{
    // Fetch a job-posting page and populate a Job from its "d_left" description
    // block: posting date, contact e-mail and phone number are pulled out of the
    // description text with regexes.
    Job jobinfo = new Job();
    string title = string.Empty;
    string description = string.Empty;
    DateTime dt = DateTime.Now;
    string email = string.Empty;
    Parser parser = new Parser(new HttpProtocol(new Uri(url)));
    NodeFilter detail = new HasAttributeFilter("class", "d_left");
    NodeList nodeDetail = parser.ExtractAllNodesThatMatch(detail);
    if (nodeDetail == null || nodeDetail.Count == 0)
    {
        // No description block — return a mostly-empty Job.
        return jobinfo;
    }
    description = GetDetailString(nodeDetail);
    // Posting date: "发布时间:yyyy-M-d H:m"; dt keeps DateTime.Now when absent.
    Match m = Regex.Match(description, @"发布时间:(?<date>\d\d\d\d-\d{1,2}\-\d{1,2} \d{1,2}\:\d{1,2})");
    dt = DateTime.Now;
    if (m.Success && m.Groups["date"].Success && DateTime.TryParse(m.Groups["date"].Value, out dt))
    {
        // TryParse already wrote the parsed date into dt — nothing more to do.
    }
    // Contact e-mail (pattern also matches bracketed IPv4 hosts).
    Match emailMatch = Regex.Match(description, @"([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)");
    if (emailMatch.Success)
    {
        email = emailMatch.Value;
    }
    // Chinese mobile number: 11 digits with a recognized prefix.
    Match telMatch = Regex.Match(description, @"(1[3|5|8][0-9]|15[0|3|6|7|8|9]|18[8|9])\d{8}");
    if (telMatch.Success)
    {
        jobinfo.tel = telMatch.Value;
    }
    jobinfo.category_id = Catalog.id;
    // NOTE(review): title is never assigned above, so this is always empty here;
    // SpiderCurrentPage overwrites jobinfo.title after calling this method.
    jobinfo.title = title;
    jobinfo.description = description;
    jobinfo.created_on = dt;
    jobinfo.is_active = true;
    jobinfo.city_id = Catalog.city.id;
    jobinfo.sp1010url = url;
    jobinfo.poster_email = email;
    return jobinfo;
}
public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
{
    // Extract the five facility descriptions (dining / parking / game center /
    // 3D / VIP) from a cinema page. Each "c_000" node is classified by the icon
    // class of its preceding sibling; facilities not present stay "".
    dining = string.Empty;
    park = string.Empty;
    gameCenter = string.Empty;
    intro3D = string.Empty;
    introVIP = string.Empty;
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        INode node = nodeList[i];
        ITag tagPar = (node.Parent as ITag);
        ITag tagSib = (node.PreviousSibling as ITag);
        // Guard the as-casts: the original dereferenced tagSib unconditionally
        // and threw NullReferenceException when a node had no tag sibling.
        if (tagPar == null || tagSib == null || tagSib.Attributes == null)
        {
            continue;
        }
        if (tagSib.Attributes["CLASS"] != null)
        {
            switch (tagSib.Attributes["CLASS"].ToString())
            {
                case "ico_cside1 mr12": dining = tagPar.ToPlainTextString(); break;
                case "ico_cside2 mr12": park = tagPar.ToPlainTextString(); break;
                case "ico_cside3 mr12": gameCenter = tagPar.ToPlainTextString(); break;
                case "ico_cside5 mr12": intro3D = tagPar.ToPlainTextString(); break;
                case "ico_cside7 mr12": introVIP = tagPar.ToPlainTextString(); break;
            }
        }
    }
    return true;
}
public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
{
    // Returns the value of attribute <attributeV> on the single node whose
    // <attribute> equals <attValue>; null when zero or multiple nodes match.
    // NOTE(review): the <tag> parameter is unused — kept for caller compatibility.
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    if (nodeList.Count == 1)
    {
        ITag tagNode = (nodeList[0] as ITag);
        if (tagNode != null && tagNode.Attributes != null)
        {
            // Guard against a missing attribute instead of throwing a
            // NullReferenceException on the ToString() call.
            object raw = tagNode.Attributes[attributeV];
            return raw == null ? null : raw.ToString();
        }
    }
    return null;
}
public Job GetDetail(string url)
{
    // Parse a detail page and populate a Job from its description ("miaoshu")
    // and contact ("mainBox") blocks. Returns an empty Job when either is missing.
    Job info = new Job();
    Parser parser = ParserHelp.GetParser(url);
    NodeFilter miaoShu = new HasAttributeFilter("id", "miaoshu");
    NodeFilter mainBox = new HasAttributeFilter("class", "mainBox");
    NodeList list = parser.Parse(new OrFilter(miaoShu, mainBox));
    if (list == null || list.Count < 2)
    {
        return info;
    }
    GetMiaoShu(list, ref info);
    GetContartInfo(list, ref info);
    return info;
}
// Analyze the entry of HtmlContents at the given index and extract its information.
private static void GetInfoFromHtml(int index)
{
    // Parse the HTML using the Winista.HtmlParser library.
    Lexer lexer = new Lexer(HtmlContents[index]);
    Parser parser = new Parser(lexer);
    // Attribute filters: the two arguments are the attribute name and value.
    HasAttributeFilter nameFilter = new HasAttributeFilter("class", "lrg");
    HasAttributeFilter priceFilter = new HasAttributeFilter("class", "bld lrg red");
    // All nodes matching the name filter.
    NodeList nameList = parser.Parse(nameFilter);
    for (int j = 0; j < nameList.Size(); j++)
    {
        // nameList[j] is known to be a Span tag; HttpUtility.HtmlDecode turns
        // HTML entity encoding back into text so Chinese displays correctly.
        string name = HttpUtility.HtmlDecode(((Span)nameList[j]).StringText);
        // Parent / NextSibling / Children walk the node tree, and
        // ExtractAllNodesThatMatch(filter, true) searches descendants recursively.
        // NOTE: to Winista.HtmlParser an "empty text node" is still a node
        // (IE's dev tools show it, Chrome's don't), so NextSibling is applied
        // twice to step over the whitespace node; likewise <del>text</del>
        // expands to three child nodes.
        NodeList priceList = nameList[j].Parent.Parent.NextSibling.NextSibling.Children.ExtractAllNodesThatMatch(priceFilter, true);
        if (priceList.Size() == 1)
        {
            string priceStr = ((Span)priceList[0]).StringText;
            // Skip the two-character prefix before parsing the number.
            double price = Double.Parse(priceStr.Substring(2, priceStr.Length - 2));
            TradeList.Add(new Commodity(name, price, "RMB"));
        }
        else
        {
            badRecordCount++;
        }
    }
    Console.WriteLine("第" + (index + 1) + "个页面处理完成!");
}
public List<ATag> ParseCatelog(string html)
{
    // Collect the category links out of the "fenlei_list" navigation block.
    // Categories are nodes containing a link matching ../product/index.php?cplm=-NNN-.
    List<ATag> catalogLinks = new List<ATag>();
    Parser parser = new Parser(new Lexer(html));
    NodeList navNodes = parser.Parse(new HasAttributeFilter("class", "fenlei_list"));
    NodeFilter catalogFilter = new HasChildFilter(new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$"));
    NodeList matches = navNodes[0].Children.ExtractAllNodesThatMatch(catalogFilter);
    if (matches == null)
    {
        return catalogLinks;
    }
    for (int idx = 0; idx < matches.Count; idx++)
    {
        catalogLinks.Add(matches[idx].Children[0] as ATag);
    }
    return catalogLinks;
}
public void SpiderCurrentPage(int idx)
{
    // Spider one listing page (index{idx}.html): fetch its "Linklist" anchors,
    // skip jobs already stored, parse and insert the rest.
    ParserConf.GetConfiguration().RootPath = AppDomain.CurrentDomain.BaseDirectory;
    string url = Catalog.sp1010 + string.Format("index{0}.html", idx);
    Parser parser;
    NodeList nodeList=null;
    int count = 0;
    bool sign=true;
    // Retry the extraction up to 5 times; a parser-construction failure aborts.
    while (sign && count<5)
    {
        SpiderEventLog.WriteSourceLog("Spider " + url, url, EventLogEntryType.Information);
        try
        {
            parser = new Parser(new HttpProtocol(new Uri(url)));
        }
        catch (Exception ex)
        {
            SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
            return;
        }
        if (parser == null)
        {
            return;
        }
        sign = false;
        try
        {
            NodeFilter filter = new HasAttributeFilter("class", "Linklist");
            nodeList = parser.ExtractAllNodesThatMatch(filter);
        }
        catch (Exception ex)
        {
            SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
            sign = true; // extraction failed — retry
        }
        count++;
    }
    if (nodeList == null)
    {
        return;
    }
    int length = nodeList.Count;
    for (int i = 0; i < length; i++)
    {
        ATag node = nodeList[i] as ATag;
        if (IsExistJob(node.Link))
        {
            // Job already in the store — skip it.
            SpiderEventLog.WriteLog(string.Format("职务 [{0}] 已存在",node.LinkText));
            continue;
        }
        Job jobinfo = GetJobInfoParser(node.Link);
        // Strip HTML entities (&xxx;) from the link text to form the title.
        jobinfo.title = Regex.Replace(node.LinkText,"&[^&;]{0,};", "",RegexOptions.IgnoreCase);
        // Echo the parsed job in red for operator visibility, then restore the color.
        ConsoleColor color = Console.ForegroundColor;
        Console.ForegroundColor = ConsoleColor.Red;
        Console.WriteLine("=".PadLeft(120,'='));
        Console.WriteLine("title:{0}", jobinfo.title);
        Console.WriteLine("url:{0}", jobinfo.sp1010url);
        Console.WriteLine("tel:{0}", jobinfo.tel);
        Console.WriteLine("email:{0}", jobinfo.poster_email);
        Console.WriteLine("desc:{0}", jobinfo.description);
        Console.WriteLine("=".PadLeft(120,'='));
        Console.ForegroundColor = color;
        InsertJobInfo(jobinfo);
    }
}
/// <summary>
/// Configure the HTML node filters used to pick apart a fan-list page.
/// </summary>
private static void MakeFilters()
{
    // The fan-list container: node-type="userListBox" AND class="cnfList".
    AndFilter listContainerFilter = new AndFilter(
        new HasAttributeFilter("node-type", "userListBox"),
        new HasAttributeFilter("class", "cnfList"));

    // Each fan entry is a direct "clearfix S_line1" child of the container.
    fanFilter = new AndFilter(
        new HasParentFilter(listContainerFilter, false),
        new HasAttributeFilter("class", "clearfix S_line1"));

    // Portrait: the "face mbspace" node directly under the "left" column.
    portraitFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("class", "left"), false),
        new HasAttributeFilter("class", "face mbspace"));

    // The remaining fields all sit directly under the "con_left" column.
    HasAttributeFilter contentColumnFilter = new HasAttributeFilter("class", "con_left");
    fanNameFilter = new AndFilter(new HasParentFilter(contentColumnFilter, false), new HasAttributeFilter("class", "name"));
    fanConnectFilter = new AndFilter(new HasParentFilter(contentColumnFilter, false), new HasAttributeFilter("class", "connect"));
    fanInfoFilter = new AndFilter(new HasParentFilter(contentColumnFilter, false), new HasAttributeFilter("class", "info"));
    followMethodFilter = new AndFilter(new HasParentFilter(contentColumnFilter, false), new HasAttributeFilter("class", "from W_textb"));
}
private static void MakeFilters()
{
    // POI list: definition-list nodes that are direct children of the
    // element with id="searchList".
    poiListFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("id", "searchList"), false),
        new NodeClassFilter(typeof(DefinitionList)));
    // Each POI entry is a definition-list bullet node.
    poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));
    // Per-aspect score fields.
    tasteFilter = new HasAttributeFilter("class", "score1");
    environmentFilter = new HasAttributeFilter("class", "score2");
    serviceFilter = new HasAttributeFilter("class", "score3");
    averageFilter = new HasAttributeFilter("class", "average");
    // Review-count element: class="B" AND module="list-readreview".
    commentFilter = new AndFilter(
        new HasAttributeFilter("class", "B"),
        new HasAttributeFilter("module", "list-readreview"));
    // Shop name: the "BL" node directly under the "shopname" container.
    nameFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("class", "shopname"), false),
        new HasAttributeFilter("class", "BL"));
    addressFilter = new HasAttributeFilter("class", "address");
    tagsFilter = new HasAttributeFilter("class", "tags");
}
private void ParseProductShowPhoto(NodeList nodes)
{
    // Locate the main product photo (class="Picture220"), rewrite its relative
    // URL to an absolute one, then download it.
    NodeFilter showcaseFilter = new HasAttributeFilter("class", "Picture220");
    ImageTag showcase = nodes.ExtractAllNodesThatMatch(showcaseFilter, true)[0] as ImageTag;
    showcase.ImageURL = showcase.ImageURL.Replace("../../", "http://rrxf.cn/");
    Console.WriteLine(showcase.ImageURL);
    DownloadPicture(showcase.ImageURL);
}
private void ParseProductDemoPhoto(NodeList nodes)
{
    // Download every demo thumbnail tagged with class="Picture40".
    NodeList thumbnails = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "Picture40"), true);
    DownloadPictures(thumbnails);
}
private void ParsePorductDescribe(NodeList nodes)
{
    // Grab the "miao" description area, download its embedded pictures, then
    // rewrite the description HTML so picture URLs point at our own mirror.
    NodeList describeArea = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "miao"), true);
    NodeList embeddedPictures = describeArea.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
    DownloadPictures(embeddedPictures);
    string describeHtml = describeArea.AsHtml();
    // Point picture links at our host, flatten pic/bigpic directories into
    // filename prefixes, and normalize dashes.
    describeHtml = Regex.Replace(describeHtml, @"http\://(www\.|)rrxf\.cn/", pictureURL + "/", RegexOptions.IgnoreCase);
    describeHtml = Regex.Replace(describeHtml, @"(pic|bigpic)/", "$1_", RegexOptions.IgnoreCase);
    describeHtml = describeHtml.Replace("-", "_");
    Console.WriteLine(describeHtml);
}
public NodeList GetDetailPageForHtml(string html)
{
    // Match either the detail box (id="detail" class="box") or the item
    // description container (id="J_DivItemDesc" class="content").
    Parser parser = GetParser(html);
    AndFilter detailBox = new AndFilter(
        new HasAttributeFilter("id", "detail"),
        new HasAttributeFilter("class", "box"));
    AndFilter descriptionContent = new AndFilter(
        new HasAttributeFilter("id", "J_DivItemDesc"),
        new HasAttributeFilter("class", "content"));
    return parser.Parse(new OrFilter(detailBox, descriptionContent));
}
protected string getPaperID(string paper_name)
{
    // Look up a paper's site ID: find the <A target="_blank"> anchor whose
    // child matches the paper name, then take the file-name stem of its href
    // (".../12345.html" -> "12345"). Returns null when not found.
    string html_page = _HttpUtil.getPaperIDHTML(paper_name);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }
    Parser p = new Parser(new Lexer(html_page));
    TagNameFilter tag_f = new TagNameFilter("A");
    HasAttributeFilter attr_f = new HasAttributeFilter("target", "_blank");
    HasChildFilter child_f = new HasChildFilter(new PaperFilter(paper_name));
    AndFilter combined = new AndFilter(new AndFilter(tag_f, attr_f), child_f);
    NodeList childs = p.ExtractAllNodesThatMatch(combined);
    if (childs == null || childs.Count <= 0)
    {
        // Paper not found.
        return null;
    }
    // TODO: multiple matches are possible; the first one wins.
    ITag t = childs[0] as ITag;
    if (t != null)
    {
        string href = t.GetAttribute("href");
        if (!string.IsNullOrEmpty(href))
        {
            string[] segments = href.Split('/');
            return segments[segments.Length - 1].Split('.')[0];
        }
    }
    // No usable href on the matched node.
    return null;
}
public static List<string> getValues(string html, string tag, string attribute, string attValue)
{
    // Returns the plain-text content of every tag whose <attribute> equals
    // <attValue>. NOTE(review): the <tag> parameter is unused — kept for
    // caller compatibility.
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    List<string> values = new List<string>();
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        ITag tagNode = (nodeList[i] as ITag);
        // Guard the as-cast (the original would throw NullReferenceException on
        // a non-tag node) as well as the attribute collection.
        if (tagNode != null && tagNode.Attributes != null && tagNode.Attributes.Count > 0)
        {
            values.Add(tagNode.ToPlainTextString());
        }
    }
    return values;
}
public void GetLinkForPage(string url)
{
    // Collect every Taobao item link on the page whose class is "EventCanSelect"
    // and append the matching nodes to ItemLink.
    Parser parser = new Parser(new Lexer(GetHtml(url)));
    parser.Encoding = "gb2312";
    NodeFilter itemLinkFilter = new AndFilter(
        new LinkRegexFilter(@"^http\://item\.taobao\.com/item\.htm\?id\=\d+$"),
        new HasAttributeFilter("class", "EventCanSelect"));
    NodeList matches = parser.Parse(itemLinkFilter);
    for (int i = 0; i < matches.Count; i++)
    {
        ItemLink.Add(matches[i]);
    }
}
/// <summary>
/// Helper: extract Weibo user information from an HTML fragment into `user`.
/// </summary>
/// <param name="currentUserHtml">HTML text containing the Weibo user's profile</param>
private void GetUserInfoFromHtml(string currentUserHtml)
{
    // One filter per profile field.
    HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
    HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
    HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
    HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
    HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
    HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");
    Lexer lexer = new Lexer(currentUserHtml);
    Parser parser = new Parser(lexer);
    // Nickname.
    NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);
    if (nickNameNodeList.Size() == 1)
    {
        user.NickName = ((Span)nickNameNodeList[0]).ToPlainTextString();
    }
    else
    {
        Console.WriteLine("判断微博名的标准出错!");
    }
    // NOTE: when reusing a parser it MUST be Reset() after each use and before
    // the next, otherwise extraction misbehaves.
    parser.Reset();
    // Remark (alias) name.
    NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);
    if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
    {
        string str = ((Span)remarkNameNodeList[0]).ToPlainTextString();
        // Strip the surrounding parentheses.
        user.RemarkName = str.Substring(1, str.Length - 2);
    }
    else
    {
        Console.WriteLine("判断微博备注名称的标准出错!");
    }
    parser.Reset();
    // Profile link URL.
    NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
    if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
    {
        user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
    }
    else
    {
        Console.WriteLine("判断微博链接地址的标准出错!");
    }
    parser.Reset();
    // Self-introduction: taken from the TITLE attribute of the second child Span.
    NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
    if (selfIntroNodeList.Size() == 1 && selfIntroNodeList[0].Children[1].GetType().Equals(typeof(Span)))
    {
        user.SelfIntroduction = ((Span)selfIntroNodeList[0].Children[1]).GetAttribute("TITLE");
    }
    else
    {
        Console.WriteLine("判断自我描述的标准出错!");
    }
    parser.Reset();
    // Tags: concatenate the text of every matching Span, space-separated.
    NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
    string str2 = "";
    for (int i = 0; i < tagsNodeList.Size(); i++)
    {
        if (tagsNodeList[i].GetType().Equals(typeof(Span)))
        {
            str2 += ((Span)tagsNodeList[i]).ToPlainTextString() + " ";
        }
    }
    user.Tags = str2;
    parser.Reset();
    // Profile attribute info.
    NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
    if (profileNodeList.Size() == 1)
    {
        // The useful information all sits inside <a> tags, so collect those and
        // then decide per tag whether to take its text or its <em> child's title.
        NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
        NodeList profileList = profileNodeList[0].Children.ExtractAllNodesThatMatch(aTagFilter, true);
        for (int j = 0; j < profileList.Size(); j++)
        {
            ATag aTag = (ATag)profileList[j];
            if (aTag.Attributes.Contains("TITLE"))
            {
                user.Profile += aTag.GetAttribute("TITLE") + " ";
            }
            else
            {
                // A node with node-type="infoSlide" marks the end of the attributes.
                if (aTag.Attributes.Contains("NODE-TYPE") && aTag.GetAttribute("NODE-TYPE").Equals("infoSlide"))
                {
                    break;
                }
                else
                {
                    // Case with an <em> child node: take the child's TITLE.
                    if (aTag.Children[0].GetType().Equals(typeof(TagNode)))
                    {
                        TagNode tagNode = (TagNode)aTag.Children[0];
                        user.Profile += tagNode.GetAttribute("TITLE") + " ";
                    }
                    else
                    {
                        // Otherwise emit the <a> tag's own text.
                        user.Profile += aTag.StringText + " ";
                    }
                }
            }
        }
    }
    else
    {
        Console.WriteLine("判断用户属性信息的标准出错!");
    }
}
/// <summary>
/// Configure the HTML node filters used to pick apart a Weibo feed page.
/// </summary>
private static void MakeFilters()
{
    // When crawling a personal home page, these filters find the divs carrying
    // a mid attribute; mid relates to maid and endid.
    idFilter = new List<HasAttributeFilter>();
    idFilter.Add(new HasAttributeFilter("class", "WB_feed_type SW_fun "));
    // Div of each individual feed entry.
    feedFilter = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix");
    idFilter.Add(feedFilter);
    // Div containing the feed author. Forwarded feeds also carry
    // class="WB_info", so pairing two filters is more reliable.
    HasAttributeFilter wbDetailFilter = new HasAttributeFilter("class", "WB_detail");
    feedAuthorFilter = new AndFilter(new HasAttributeFilter("class", "WB_info"), new HasParentFilter(wbDetailFilter, false));
    // Div containing the feed body text. Forwarded feeds also carry
    // class="WB_text", hence the second filter.
    feedContentFilter = new AndFilter(new HasAttributeFilter("class", "WB_text"), new HasAttributeFilter("node-type", "feed_list_content"));
    // Div containing a forwarded (reposted) feed.
    reFeedFilter = new HasAttributeFilter("node-type", "feed_list_forwardContent");
    // Original author of a forwarded feed (two filters for the same reason).
    reFeedAuthorFilter = new AndFilter(new HasAttributeFilter("class", "WB_info"), new HasParentFilter(reFeedFilter, true));
    // Body text of a forwarded feed (two filters for the same reason).
    reFeedContentFilter = new AndFilter(new HasAttributeFilter("class", "WB_text"), new HasAttributeFilter("node-type", "feed_list_reason"));
    // A forwarded feed that has been deleted (when its div sits under the div
    // matched by reFeedFilter).
    refeedDeletedFilter1 = new HasAttributeFilter("class", "WB_deltxt");
    // Same, for the case where the div sits under <div class="WB_datail">.
    refeedDeletedFilter2 = new AndFilter(new HasParentFilter(wbDetailFilter, true), refeedDeletedFilter1);
    // <b> tag holding the forward count of the original feed.
    similarFeedCountFilter = new AndFilter(new HasAttributeFilter("class", "S_spetxt"), new HasAttributeFilter("node-type", "followNum"));
    // Markers for forwards similar to the original feed.
    HasAttributeFilter similarFeedFilterByParent = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix WB_feed_noLine");
    similarFeedFilter = new AndFilter(wbDetailFilter, new HasParentFilter(similarFeedFilterByParent, false));
    // Div holding the feed's posting location.
    feedLocationFilter = new AndFilter(new HasAttributeFilter("class", "map_data"), new HasParentFilter(wbDetailFilter, false));
    // Div holding the posting time, client, forward count and comment count.
    AndFilter feedMetaDataFilter = new AndFilter(new NotFilter(new HasParentFilter(new HasAttributeFilter("class", "WB_media_expand SW_fun2 S_line1 S_bg1"), true)), new HasAttributeFilter("class", "WB_func clearfix"));
    // Div holding the forward and comment counts.
    AndFilter feedHandleFilter = new AndFilter(new HasParentFilter(feedMetaDataFilter, false), new HasAttributeFilter("class", "WB_handle"));
    // Div holding the posting time and client.
    feedFromFilter = new AndFilter(new HasParentFilter(feedMetaDataFilter, false), new HasAttributeFilter("class", "WB_from"));
    // Link carrying the "like" count.
    feedLikeFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_like"));
    // Link carrying the forward count.
    feedForwardFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_forward"));
    // Link carrying the comment count.
    feedCommentFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_comment"));
    // Link carrying the feed's posting time.
    feedTimeFilter = new AndFilter(new HasParentFilter(feedFromFilter, false), new HasAttributeFilter("class", "S_link2 WB_time"));
    // Link carrying the feed's posting client/source.
    feedSendTypeFilter = new AndFilter(new HasParentFilter(feedFromFilter, false), new HasAttributeFilter("class", "S_link2"));
}
public float getCinemaGrade(string html)
{
    // Parse the cinema score from the single "point ml12 px18" element, whose
    // first and last children hold the two halves of the score text.
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "point ml12 px18");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    if (nodeList.Count == 1)
    {
        INode node = nodeList[0];
        ITag tagLeft = (node.FirstChild as ITag);
        ITag tagRight = (node.LastChild as ITag);
        string left = tagLeft.ToPlainTextString();
        string right = tagRight.ToPlainTextString();
        // Concatenate the two parts and parse, e.g. "7." + "8" -> 7.8.
        return float.Parse(left + right);
    }
    // Default grade when the score element is absent or ambiguous.
    return 7.0f;
}
public void ParseProduct(ATag a)
{
    // Parse one product page: title, showcase photo, demo photos, description,
    // then the per-attribute rows (currently only color rows get extra parsing).
    string html = GetHtml(a.Link);
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter productArea = new HasAttributeFilter("id", "productyou");
    NodeList nodes = parser.ExtractAllNodesThatMatch(productArea);
    ParseProductTitle(nodes);
    ParseProductShowPhoto(nodes);
    ParseProductDemoPhoto(nodes);
    ParsePorductDescribe(nodes);
    NodeFilter productAttributeArea = new HasAttributeFilter("class", "chans");
    // NOTE(review): productAttributeAreaNodes is never used below — the "cph"
    // search runs against the full product area instead of this "chans" subset.
    // Looks unintentional; confirm whether the attribute search should be
    // scoped to the "chans" area.
    NodeList productAttributeAreaNodes = nodes.ExtractAllNodesThatMatch(productAttributeArea,true);
    NodeFilter productAttributes = new HasAttributeFilter("class", "cph");
    NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(productAttributes, true);
    int length = productAttributeNodes.Count;
    for (int i = 0; i < length; i++)
    {
        INode n = productAttributeNodes[i].Children[0];
        string t =n.ToPlainTextString();
        // Rows whose label starts with "颜色" (color) get dedicated parsing.
        if (Regex.Match(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success)
        {
            ParseProductColors(n);
        }
        Console.WriteLine();
    }
}
public void ParseProducts(ATag a)
{
    // Walk every product block ("photoyi") on the listing page and parse each
    // linked product page in turn.
    string html = GetHtml(a.Link.Replace("../", "http://rrxf.cn/"));
    Parser parser = new Parser(new Lexer(html));
    NodeList productBlocks = parser.Parse(new HasAttributeFilter("class", "photoyi"));
    if (productBlocks == null)
    {
        return;
    }
    for (int i = 0; i < productBlocks.Count; i++)
    {
        ATag productLink = ParseProductUrl(productBlocks[i].ToHtml());
        Console.WriteLine(productLink.Link);
        ParseProduct(productLink);
    }
}
private static void ParseProductTitle(NodeList nodes)
{
    // Print the product title held in the "prouductx" element.
    NodeList titleNodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "prouductx"), true);
    Console.WriteLine(titleNodes[0].ToPlainTextString());
}
protected ArrayList getPaperReferenceByID(ArrayList paper_id)
{
    // Fetch the reference-export HTML for the given paper IDs and return the
    // cleaned plain-text reference strings; null when the page is unavailable
    // or the export container is missing.
    string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }
    Parser p = new Parser(new Lexer(html_page));
    // The references live inside <div id="export_container">.
    AndFilter af = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "export_container"));
    NodeList childs = p.ExtractAllNodesThatMatch(af);
    if (childs == null || childs.Count <= 0)
    {
        return null;
    }
    NodeList ref_childs = childs[0].Children;
    ArrayList ref_list = new ArrayList();
    for (int i = 0; i < ref_childs.Count; ++i)
    {
        ITag tag = ref_childs[i] as ITag;
        if (tag != null)
        {
            string str = tag.ToPlainTextString();
            // Flatten line breaks, then drop the leading "[n]" numbering.
            str = str.Replace('\r', ' ').Replace('\n', ' ');
            str = str.Substring(str.IndexOf(']') + 1);
            ref_list.Add(str);
        }
    }
    // Notify the progress listener, if one is attached.
    if (_Progressable != null)
    {
        _Progressable.onFinish(ref_list);
    }
    return ref_list;
}
public List<Model.Play> getPlays(string xmlFile)
{
    // Parse a saved cinema page into Play records, one per screening.
    // The 4-digit run in the file name is taken as the cinema ID — assumes the
    // path always embeds it; TODO confirm against the caller.
    Match strCinema = Regex.Match(xmlFile, @"\d\d\d\d");
    string cinemaID = strCinema.Value;
    string html = File.ReadAllText(xmlFile);
    List<Model.Play> plays = new List<Model.Play>();
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    // Movie list nodes: elements with METHOD="mdShowtime".
    string listAttribute = "METHOD";
    string listAttValue = "mdShowtime";
    NodeFilter movieListFilter = new HasAttributeFilter(listAttribute, listAttValue);
    NodeList movieNodeList = parser.ExtractAllNodesThatMatch(movieListFilter);
    if (movieNodeList.Count >= 1)
    {
        // Re-parse each movie's own HTML fragment separately.
        for (int i = 0; i < movieNodeList.Count; i++)
        {
            INode node = movieNodeList[i];
            string movieHtml = node.ToHtml();
            Lexer movieLexer = new Lexer(movieHtml);
            Parser movieParser = new Parser(movieLexer);
            // Movie ID: a 6-digit (or, failing that, 5-digit) number inside the
            // HREF of the "c_000" link.
            NodeFilter idFilter = new HasAttributeFilter("CLASS", "c_000");
            NodeList idNodes = movieParser.ExtractAllNodesThatMatch(idFilter);
            string strID = string.Empty;
            if (idNodes.Count >= 1)
            {
                ITag idTag = (idNodes[0] as ITag);
                if (idTag.Attributes != null)
                {
                    string str = idTag.Attributes["HREF"].ToString();
                    Match match = Regex.Match(str, @"\d\d\d\d\d\d");
                    if (match.Success)
                    {
                        strID = match.Value; // 6-digit movie ID
                    }
                    else
                    {
                        // Fall back to a 5-digit ID.
                        Match ma = Regex.Match(str, @"\d\d\d\d\d");
                        if (ma.Success)
                        {
                            strID = ma.Value;
                        }
                    }
                }
            }
            // Screening list: elements with _TYPE="expiry", parsed from a fresh
            // parser over the same fragment.
            Lexer lexer2 = new Lexer(movieHtml);
            Parser movieParser2 = new Parser(lexer2);
            NodeFilter playFilter = new HasAttributeFilter("_TYPE", "expiry");
            NodeList playNodes = movieParser2.ExtractAllNodesThatMatch(playFilter);
            if (playNodes.Count >= 1)
            {
                for (int j = 0; j < playNodes.Count; j++)
                {
                    Model.Play play = new Model.Play();
                    ITag playTag = (playNodes[j] as ITag);
                    if (playTag.Attributes != null)
                    {
                        play.CinemaID = int.Parse(cinemaID);
                        play.MovieID = int.Parse(strID);
                        play.PlayID = int.Parse(playTag.Attributes["SHOWTIMEID"].ToString());
                        string strTime = playTag.Attributes["TIME"].ToString();
                        if (strTime == null || strTime == "")
                        {
                            continue;
                        }
                        // Drop the first 10 characters (the date portion) so only
                        // the time of day remains.
                        strTime = strTime.Trim();
                        strTime = strTime.Remove(0, 10);
                        play.PlayName = strTime.Trim();
                        // Ticket price sits two levels of sibling/child hops away.
                        // NOTE(review): fragile — presumably the intermediate
                        // siblings are whitespace text nodes; verify markup.
                        string strPrice = playTag.FirstChild.NextSibling.FirstChild.NextSibling.ToPlainTextString();
                        if (strPrice != null&&strPrice!=""&&strPrice!=" ")
                        {
                            // Strip the leading character (currency symbol) before parsing.
                            strPrice = strPrice.Trim();
                            strPrice = strPrice.Remove(0, 1);
                            play.Price = float.Parse(strPrice);
                        }
                        else
                        {
                            play.Price = 0f;
                        }
                        plays.Add(play);
                    }
                }
            }
        }
        return plays;
    }
    // No movie nodes found — callers must handle null.
    return null;
}