/// <summary>
/// Parses the page at the configured URL and stores one <c>ImageData</c> entry in
/// <c>m_Links</c> for every node returned by an <c>ImageTag</c> class filter.
/// </summary>
/// <returns>
/// <c>LinkStatus.Broken</c> when the parser's page has no source after
/// <c>CreateParser()</c> has run; otherwise <c>LinkStatus.Ok</c>.
/// </returns>
/// <exception cref="ArgumentException">The URL field is null or empty.</exception>
public LinkStatus ExtractLinks()
{
    // A null URL is just as unusable as an empty one, so reject both up front
    // instead of letting null slip through to CreateParser().
    if (string.IsNullOrEmpty(m_strUrl))
    {
        throw new ArgumentException("No URL specified");
    }

    // Start from a fresh collection on every call.
    m_Links = new ImageDataCollection();
    CreateParser();

    if (m_obParser.Lexer.Page.mSource == null)
    {
        return LinkStatus.Broken;
    }

    INodeFilter obFilter = new NodeClassFilter(typeof(ImageTag));
    NodeList collNodes = m_obParser.Parse(obFilter);
    if (null != collNodes)
    {
        // Every image found on this page shares the same page descriptor.
        PageData obPageData = new PageData();
        obPageData.m_strUrl = m_obParser.URL;
        obPageData.m_iDepth = m_iLevel;

        for (Int32 i = 0; i < collNodes.Count; i++)
        {
            INode obNode = collNodes[i];
            m_Links.Add(new ImageData(obPageData, obNode as ImageTag));
        }
    }

    return LinkStatus.Ok;
}
/// <summary>
/// Runs <c>CollectInto</c> on the given node with a <c>NodeClassFilter</c> built for
/// the requested type and returns whatever was collected.
/// </summary>
/// <param name="node">The node to search.</param>
/// <param name="type">The class to search for.</param>
/// <returns>A node array with the matching nodes.</returns>
public static INode[] FindTypeInNode(INode node, System.Type type)
{
    NodeList matches = new NodeList();
    node.CollectInto(matches, new NodeClassFilter(type));
    return matches.ToNodeArray();
}
/// <summary>
/// Builds the node filters used by this class and stores them in the corresponding
/// filter fields.
/// </summary>
private static void MakeFilters()
{
    // POI list: DefinitionList nodes whose parent carries id="searchList".
    // NOTE(review): the 'false' argument is presumably HasParentFilter's
    // "recursive" flag - confirm against the library documentation.
    poiListFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("id", "searchList"), false),
        new NodeClassFilter(typeof(DefinitionList)));

    // A single POI entry is matched by node class alone.
    poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

    // Score and average fields are selected by their CSS class.
    tasteFilter = new HasAttributeFilter("class", "score1");
    environmentFilter = new HasAttributeFilter("class", "score2");
    serviceFilter = new HasAttributeFilter("class", "score3");
    averageFilter = new HasAttributeFilter("class", "average");

    // Comment: class="B" together with module="list-readreview".
    commentFilter = new AndFilter(
        new HasAttributeFilter("class", "B"),
        new HasAttributeFilter("module", "list-readreview"));

    // Name: class="BL" node whose parent has class="shopname".
    nameFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("class", "shopname"), false),
        new HasAttributeFilter("class", "BL"));

    addressFilter = new HasAttributeFilter("class", "address");
    tagsFilter = new HasAttributeFilter("class", "tags");
}
/// <summary>
/// Parses the page with a class filter for <c>LinkTag</c> and returns the result.
/// </summary>
/// <returns>The node list produced by <c>Parse</c> for the link-tag filter.</returns>
public NodeList GetAllOutLinks()
{
    INodeFilter linkTagsOnly = new NodeClassFilter(typeof(LinkTag));
    NodeList outLinks = this.Parse(linkTagsOnly);
    return outLinks;
}
/// <summary>
/// Extracts the nodes with class="miao", hands their image tags to
/// <c>DownloadPictures</c>, rewrites the area's HTML and prints it to the console.
/// </summary>
/// <param name="nodes">Node list to search for the class="miao" area.</param>
private void ParsePorductDescribe(NodeList nodes)
{
    NodeFilter descriptionFilter = new HasAttributeFilter("class", "miao");
    NodeList descriptionArea = nodes.ExtractAllNodesThatMatch(descriptionFilter, true);

    // Pictures are downloaded before the HTML is rewritten below.
    NodeFilter imageFilter = new NodeClassFilter(typeof(ImageTag));
    DownloadPictures(descriptionArea.ExtractAllNodesThatMatch(imageFilter, true));

    // 1) replace the rrxf.cn host prefix (with or without "www.") by pictureURL + "/",
    // 2) turn "pic/" and "bigpic/" into "pic_" and "bigpic_",
    // 3) replace every '-' in the resulting HTML with '_'.
    string html = Regex.Replace(
        Regex.Replace(
            descriptionArea.AsHtml(),
            @"http\://(www\.|)rrxf\.cn/",
            pictureURL + "/",
            RegexOptions.IgnoreCase),
        @"(pic|bigpic)/",
        "$1_",
        RegexOptions.IgnoreCase);

    Console.WriteLine(html.Replace("-", "_"));
}
/// <summary>
/// Helper: takes the given HTML node as the root and extracts the text of its direct
/// children as the weibo (microblog) content.
/// </summary>
/// <param name="feed">Feed instance that holds the weibo content; its <c>ReFeedFrom</c> is set when a text node ending in "//" is found.</param>
/// <param name="node">The HTML node used as the root.</param>
/// <param name="hasEmTag">Whether the children contain an em tag (only reposted weibo have one). When true, collection begins only after the EM tag has been seen; when false it begins immediately.</param>
/// <returns>The weibo content with leading and trailing spaces, carriage returns and line feeds removed.</returns>
private static string GetContentFromChildren(Feed feed, INode node, bool hasEmTag)
{
    string content = "";

    // A node without a child list has nothing to extract; treat it as empty
    // instead of failing on a null reference.
    NodeList children = node.Children;
    int childCount = (children == null) ? 0 : children.Size();

    bool started = !hasEmTag;
    for (int i = 0; i < childCount; i++)
    {
        INode child = children[i];
        // Exact type comparisons are intentional: tag classes derive from one
        // another, so an "is" test would route nodes to the wrong branch.
        Type childType = child.GetType();

        if (!started)
        {
            // Skip everything up to and including the EM tag.
            if (childType == typeof(TagNode) && ((TagNode)child).TagName.Equals("EM"))
            {
                started = true;
            }
            continue;
        }

        if (childType == typeof(TextNode))
        {
            string text = ((TextNode)child).ToPlainTextString();

            // A trailing "//" means the content is complete; what follows is the
            // chain of users the weibo was reposted from.
            if (text.EndsWith("//", StringComparison.Ordinal))
            {
                // Drop the "//" marker itself.
                content += text.Substring(0, text.Length - 2);
                feed.ReFeedFrom = CollectReFeedChain(children, i + 1);
                break;
            }

            content += text;
        }
        else if (childType == typeof(ATag))
        {
            // Links sometimes contain more than a text node (e.g. a span), so gather
            // every text node found beneath the link.
            NodeList linkChildren = ((ATag)child).Children;
            if (linkChildren != null)
            {
                NodeList textNodes = linkChildren.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(TextNode)), true);
                for (int j = 0; j < textNodes.Size(); j++)
                {
                    content += ((TextNode)textNodes[j]).ToPlainTextString();
                }
            }
        }
        else if (childType == typeof(TagNode))
        {
            content += ((TagNode)child).ToPlainTextString();
        }
        else if (childType == typeof(ImageTag))
        {
            content += ((ImageTag)child).GetAttribute("TITLE");
        }
    }

    // In some cases the first/last few characters are spaces and line breaks.
    return content.Trim(' ', '\r', '\n');
}

/// <summary>
/// Builds the repost chain from the links carrying a USERCARD attribute, starting at
/// <paramref name="startIndex"/>. Each name is prepended, so later links come first.
/// </summary>
private static string CollectReFeedChain(NodeList siblings, int startIndex)
{
    string chain = "";
    for (int j = startIndex; j < siblings.Size(); j++)
    {
        INode sibling = siblings[j];
        if (sibling.GetType() != typeof(ATag))
        {
            continue;
        }

        ATag link = (ATag)sibling;
        if (!link.Attributes.ContainsKey("USERCARD"))
        {
            continue;
        }

        string reFeeder = link.StringText;
        // Guard against an empty link text before looking at its first character.
        if (!string.IsNullOrEmpty(reFeeder) && reFeeder[0] == '@')
        {
            // Strip the leading "@" and put this user in front of the chain.
            chain = reFeeder.Substring(1) + " " + chain;
        }
        else
        {
            Console.WriteLine("获取转发链时出现错误!此前的转发链为" + chain);
        }
    }

    return chain;
}
/// <summary>
/// Helper: extracts user information from HTML and stores it on <c>user</c>
/// (nickname, remark name, link URL, self introduction, tags and profile).
/// A field whose markup does not match the expected shape is skipped and a
/// diagnostic line is written to the console.
/// </summary>
/// <param name="currentUserHtml">HTML text that contains the weibo user's information.</param>
private void GetUserInfoFromHtml(string currentUserHtml)
{
    // Set up the filters for each piece of information.
    HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
    HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
    HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
    HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
    HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
    HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");

    Lexer lexer = new Lexer(currentUserHtml);
    Parser parser = new Parser(lexer);

    // Nickname: exactly one match, and it must be a Span (checked rather than
    // blindly cast, consistent with the other lookups below).
    NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);
    Span nickNameSpan = (nickNameNodeList.Size() == 1) ? nickNameNodeList[0] as Span : null;
    if (nickNameSpan != null)
    {
        user.NickName = nickNameSpan.ToPlainTextString();
    }
    else
    {
        Console.WriteLine("判断微博名的标准出错!");
    }

    // Note: to reuse the parser, Reset() must be called after one use has finished
    // and before the next one starts, otherwise parsing fails.
    parser.Reset();

    // Remark name.
    NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);
    string remarkText = null;
    if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
    {
        remarkText = ((Span)remarkNameNodeList[0]).ToPlainTextString();
    }

    // The text is expected to be wrapped in brackets; anything shorter than the two
    // brackets cannot be unwrapped and is reported instead of throwing.
    if (remarkText != null && remarkText.Length >= 2)
    {
        // Strip the leading and trailing bracket.
        user.RemarkName = remarkText.Substring(1, remarkText.Length - 2);
    }
    else
    {
        Console.WriteLine("判断微博备注名称的标准出错!");
    }

    parser.Reset();

    // Weibo link address.
    NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
    if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
    {
        user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
    }
    else
    {
        Console.WriteLine("判断微博链接地址的标准出错!");
    }

    parser.Reset();

    // Self introduction: the TITLE attribute of the second child, which must be a Span.
    NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
    INode introNode = null;
    if (selfIntroNodeList.Size() == 1)
    {
        // Make sure a second child actually exists before indexing it.
        NodeList introChildren = selfIntroNodeList[0].Children;
        if (introChildren != null && introChildren.Size() > 1)
        {
            introNode = introChildren[1];
        }
    }

    if (introNode != null && introNode.GetType().Equals(typeof(Span)))
    {
        user.SelfIntroduction = ((Span)introNode).GetAttribute("TITLE");
    }
    else
    {
        Console.WriteLine("判断自我描述的标准出错!");
    }

    parser.Reset();

    // Tags: plain text of every matching Span, separated by spaces.
    NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
    string tagText = "";
    for (int i = 0; i < tagsNodeList.Size(); i++)
    {
        if (tagsNodeList[i].GetType().Equals(typeof(Span)))
        {
            tagText += ((Span)tagsNodeList[i]).ToPlainTextString() + " ";
        }
    }
    user.Tags = tagText;

    parser.Reset();

    // Profile attributes.
    NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
    if (profileNodeList.Size() == 1)
    {
        // The useful information all lives in <a> tags, so collect those and then
        // decide per link whether to take its text or the title of an inner <em>.
        NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
        NodeList profileList = profileNodeList[0].Children.ExtractAllNodesThatMatch(aTagFilter, true);
        for (int j = 0; j < profileList.Size(); j++)
        {
            ATag aTag = (ATag)profileList[j];
            if (aTag.Attributes.Contains("TITLE"))
            {
                user.Profile += aTag.GetAttribute("TITLE") + " ";
                continue;
            }

            // A link with node-type="infoSlide" marks the end of the attributes.
            if (aTag.Attributes.Contains("NODE-TYPE") && aTag.GetAttribute("NODE-TYPE").Equals("infoSlide"))
            {
                break;
            }

            // Only inspect the first child when the link actually has one.
            NodeList linkChildren = aTag.Children;
            bool firstChildIsTag = linkChildren != null
                && linkChildren.Size() > 0
                && linkChildren[0].GetType().Equals(typeof(TagNode));

            if (firstChildIsTag)
            {
                // The link wraps an <em> child: use that child's title.
                user.Profile += ((TagNode)linkChildren[0]).GetAttribute("TITLE") + " ";
            }
            else
            {
                // Otherwise use the text contained in the <a> tag directly.
                user.Profile += aTag.StringText + " ";
            }
        }
    }
    else
    {
        Console.WriteLine("判断用户属性信息的标准出错!");
    }
}
/// <summary>
/// Extracts nodes from <paramref name="result"/> using a chain of XOR filters:
/// the <c>ImageTag</c> class filter XOR-ed with a src match for each of two fixed
/// image URLs in turn.
/// </summary>
/// <param name="result">Node list to extract from.</param>
/// <returns>The nodes accepted by the combined filter.</returns>
public NodeList GetPicturesForDetailHtml(NodeList result)
{
    // NOTE(review): presumably these are button/icon images that should not be
    // treated as product pictures - confirm with the caller.
    string[] excludedSources = new string[]
    {
        "http://a.tbcdn.cn/sys/common/icon/btn/add_to_share.png",
        "http://img04.taobaocdn.com/tps/i4/T1qU4sXiXxXXXXXXXX-114-25.png"
    };

    NodeFilter imageFilter = new NodeClassFilter(typeof(ImageTag));

    // Seed the chain with the image filter, then wrap it once per remaining URL.
    XorFilter combined = new XorFilter(imageFilter, new HasAttributeFilter("src", excludedSources[0]));
    for (int k = 1; k < excludedSources.Length; k++)
    {
        combined = new XorFilter(combined, new HasAttributeFilter("src", excludedSources[k]));
    }

    return result.ExtractAllNodesThatMatch(combined, true);
}