/// <summary>
/// Collects the image URLs referenced inside an answer's rich-text area.
/// The markup is searched for elements whose <c>class</c> is
/// "RichContent RichContent--unescapable", then inside those for elements whose <c>class</c> is
/// "RichText CopyrightRichText-richText", and finally for <c>img</c> tags, from which the
/// <c>data-actualsrc</c> attribute is read.
/// </summary>
/// <param name="s">HTML markup to scan.</param>
/// <returns>
/// The non-empty <c>data-actualsrc</c> values with duplicates removed; an empty list when
/// <paramref name="s"/> is null/empty, when the parsed page exposes no body, or when nothing matches.
/// </returns>
public static List<string> GetAnswerDivRichContent(this string s)
{
    List<string> res = new List<string>();

    // Nothing to parse: return the empty result instead of handing null/"" to the parser.
    if (string.IsNullOrEmpty(s))
    {
        return res;
    }

    Parser parser = Parser.CreateParser(s, "utf-8");
    HtmlPage htmlPage = new HtmlPage(parser);
    parser.VisitAllNodesWith(htmlPage);

    // Guard against a page without usable body nodes so the lookups below cannot dereference null.
    if (htmlPage.Body == null)
    {
        return res;
    }

    HasAttributeFilter richContent = new HasAttributeFilter
    {
        AttributeName = "class",
        AttributeValue = "RichContent RichContent--unescapable"
    };
    HasAttributeFilter span = new HasAttributeFilter
    {
        AttributeName = "class",
        AttributeValue = "RichText CopyrightRichText-richText"
    };

    // Narrow the search step by step: rich-content container -> rich-text element -> img tags.
    var divNodes = htmlPage.Body.ExtractAllNodesThatMatch(richContent, true);
    var spanNodes = divNodes.ExtractAllNodesThatMatch(span, true);
    var imgNodes = spanNodes.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);

    for (int i = 0; i < imgNodes.Count; i++)
    {
        var imageTag = (Winista.Text.HtmlParser.Tags.ImageTag)imgNodes[i];

        // Only "data-actualsrc" is read; img tags without that attribute are skipped.
        var imgUrl = imageTag.GetAttribute("data-actualsrc");
        if (!string.IsNullOrEmpty(imgUrl))
        {
            res.Add(imgUrl);
        }
    }

    return res.Distinct().ToList();
}
/// <summary>
/// Reads the chapter links from the local catalogue page "catalogue.htm", downloads every linked
/// chapter page from http://www.mlxiaoshuo.com, and writes the paragraph text of each chapter to
/// "革命逸事.txt". Progress and per-chapter extraction problems are reported on the console.
/// </summary>
static void GetStoryOfRevolution()
{
    // Read the catalogue and release the file handle immediately.
    string catalogueHtml;
    using (StreamReader reader = new StreamReader("catalogue.htm"))
    {
        catalogueHtml = reader.ReadToEnd();
    }

    // Chapter links: elements with class "fontStyle2 colorStyleLink" located under an element
    // with class "row zhangjieUl".
    Parser parser = new Parser(new Lexer(catalogueHtml));
    HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
    HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
    AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
    NodeList linkNodeList = parser.Parse(linkFilter);

    List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
    for (int i = 0; i < linkNodeList.Size(); i++)
    {
        ATag linkNode = (ATag)linkNodeList[i];
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkNode.Link);

        // Dispose the response and its reader for every chapter; previously only the last reader
        // was closed, and closing it failed with a NullReferenceException when no link matched.
        using (WebResponse response = httpWebRequest.GetResponse())
        using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
        {
            chapterHtmlContentList.Add(chapterReader.ReadToEnd());
        }

        Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
    }

    HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");

    // The using block guarantees the output file is flushed and closed even if parsing throws.
    using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
    {
        for (int i = 0; i < chapterHtmlContentList.Count; i++)
        {
            writer.WriteLine("第" + (i + 1) + "章");

            parser = new Parser(new Lexer(chapterHtmlContentList[i]));
            NodeList praghNodeList = parser.Parse(praghFilter);

            // Exactly one text container is expected per chapter page; anything else is reported.
            if (praghNodeList.Size() == 1)
            {
                NodeList children = praghNodeList[0].Children;
                for (int j = 0; j < children.Size(); j++)
                {
                    // Only nodes whose exact type is ParagraphTag are written out.
                    if (children[j].GetType().Equals(typeof(ParagraphTag)))
                    {
                        ParagraphTag praghTag = (ParagraphTag)children[j];
                        writer.WriteLine(" " + praghTag.StringText);
                    }
                }

                writer.WriteLine();
            }
            else
            {
                Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
            }
        }
    }
}
/// <summary>
/// Queries the Sogou WeChat article search for <paramref name="key"/> and converts the result
/// entries (elements with class "wx-rb wx-rb3") into rows of a table created by <c>GetStruct</c>.
/// Result pages are requested one after another (page numbers start at 1) while
/// <c>pageIndex * 10 &lt; count</c>.
/// </summary>
/// <param name="key">Search keyword; it is URL-encoded before being placed in the query.</param>
/// <param name="count">Upper bound used for paging: pages are fetched while page index * 10 is below it.</param>
/// <param name="time">Compared against the time at which the search started (see the note in the body).</param>
/// <returns>The filled table, or <c>null</c> when <paramref name="key"/> is null or empty.</returns>
public DataTable GetWXBySogou(string key, int count, DateTime time)
{
    if (string.IsNullOrEmpty(key))
    {
        return null;
    }

    const string urlTemplate = "http://weixin.sogou.com/weixin?type=2&query={0}&fr=sgsearch&ie=utf8&_ast=1433216256&_asf=null&w=01059900&cid=null&page={1}";

    DataTable table = GetStruct(key);
    DateTime startedAt = DateTime.Now;

    int pageIndex = 0;
    while (pageIndex * 10 < count)
    {
        string pageUrl = string.Format(urlTemplate, HttpUtility.UrlEncode(key), pageIndex + 1);
        string pageHtml = ieHelp.GetHtmlFromSite(pageUrl);
        HtmlPage page = htmlHelp.GetPage(pageHtml);

        Winista.Text.HtmlParser.Util.NodeList resultNodes =
            page.Body.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "wx-rb wx-rb3"), true);
        if (resultNodes.Count <= 0)
        {
            // No result entries on this page: stop paging.
            break;
        }

        // Turn every result entry into one table row.
        for (int i = 0; i < resultNodes.Count; i++)
        {
            Winista.Text.HtmlParser.Util.NodeList entryChildren = resultNodes[i].Children;
            DataRow row = table.NewRow();

            // Title and link: the first <a> whose parent is an <h4>.
            NodeFilter titleFilter = new AndFilter(new HasParentFilter(new TagNameFilter("h4")), new TagNameFilter("a"));
            Winista.Text.HtmlParser.Util.NodeList titleLinks = entryChildren.ExtractAllNodesThatMatch(titleFilter, true);
            ATag titleLink = (ATag)titleLinks[0];
            row["Title"] = titleLink.StringText;
            row["Link"] = titleLink.Link;

            // Author: the "title" attribute of the first <a id="weixin_account">.
            NodeFilter authorFilter = new AndFilter(new HasAttributeFilter("id", "weixin_account"), new TagNameFilter("a"));
            Winista.Text.HtmlParser.Util.NodeList authorLinks = entryChildren.ExtractAllNodesThatMatch(authorFilter, true);
            ATag authorLink = (ATag)authorLinks[0];
            row["Author"] = authorLink.GetAttribute("title");

            // Timestamp: the "t" attribute of the first element with class "s-p",
            // converted by GetDateTime (presumably a Unix timestamp — confirm against GetDateTime).
            NodeFilter dateFilter = new HasAttributeFilter("class", "s-p");
            Winista.Text.HtmlParser.Util.NodeList dateNodes = entryChildren.ExtractAllNodesThatMatch(dateFilter, true);
            Div dateDiv = (Div)dateNodes[0];
            string unixtime = dateDiv.GetAttribute("t");
            row["Cdate"] = GetDateTime(unixtime);
            row["Day"] = GetDateTime(unixtime).Day;

            row["Source"] = "微信";
            table.Rows.Add(row);
        }

        // NOTE(review): startedAt is assigned once before the loop and never updated, so this
        // check has the same outcome on every page (it ends the search after the first page
        // whenever `time` lies in the future). It may have been meant to compare article dates
        // against `time` — confirm the intent before changing it.
        if (startedAt < time)
        {
            break;
        }

        pageIndex++;
    }

    return table;
}
/// <summary>
/// Extracts the part of <paramref name="html"/> selected by <paramref name="model"/>:
/// tags of type <c>model.EType</c>, optionally narrowed by <c>id</c> (<c>model.ID</c>) and by
/// <c>class</c> (<c>model.CSS</c>). When <c>model.EType</c> is "title" (any casing) the result of
/// <c>GetTitle</c> is returned instead. Images inside the selection are downloaded through
/// <c>httpHelp.DownFile</c>, and the resulting markup is optionally cut down to the text between
/// <c>model.Start</c> and <c>model.End</c>.
/// </summary>
/// <remarks>
/// Original author's notes: this may later be rewritten to use filters only; the OR and AND
/// filters each accept only two operands at a time.
/// </remarks>
/// <param name="html">HTML markup to search.</param>
/// <param name="model">Selection criteria.</param>
/// <returns>The selected markup as HTML text.</returns>
public string GetByFilter(string html, FilterModel model)
{
    // Ordinal, case-insensitive comparison: independent of the current culture and safe when
    // EType is null (the previous ToLower().Equals(...) was neither).
    if (string.Equals(model.EType, "title", System.StringComparison.OrdinalIgnoreCase))
    {
        return GetTitle(html);
    }

    NodeList nodes = GetTagList(html, model.EType);
    if (!string.IsNullOrEmpty(model.ID))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("id", model.ID));
    }
    if (!string.IsNullOrEmpty(model.CSS))
    {
        nodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", model.CSS));
    }

    if (!model.AllowScript)
    {
        // The previous code extracted the script nodes into a new list and discarded it, which
        // left every script in place. Keep only the nodes that are NOT script tags, recursively.
        nodes.KeepAllNodesThatMatch(new NotFilter(new TagNameFilter("script")), true);
    }

    // Store the images locally.
    NodeList imgs = nodes.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
    for (int i = 0; i < imgs.Count; i++)
    {
        ImageTag img = imgs[i] as ImageTag;

        // Skip nodes that are not image tags or that carry no URL, instead of dereferencing null.
        if (img == null || string.IsNullOrEmpty(img.ImageURL))
        {
            continue;
        }

        string savepath = function.VToP(vdir + Path.GetFileName(img.ImageURL));
        if (File.Exists(savepath))
        {
            // Avoid downloading the same image again. The tag's URL is left untouched in this case.
            continue;
        }

        img.ImageURL = httpHelp.DownFile(baseurl, img.ImageURL, savepath);
    }

    string result = nodes.AsHtml();
    if (!string.IsNullOrWhiteSpace(model.Start) && !string.IsNullOrWhiteSpace(model.End))
    {
        result = regHelper.GetValueBySE(result, model.Start, model.End);
    }

    return result;
}
/// <summary>
/// Builds the HTML node filters used to locate fan entries and their details, and stores them in
/// the static filter fields (<c>fanFilter</c>, <c>portraitFilter</c>, <c>fanNameFilter</c>,
/// <c>fanConnectFilter</c>, <c>fanInfoFilter</c>, <c>followMethodFilter</c>).
/// </summary>
/// <remarks>
/// Every filter pairs a parent condition with a <c>class</c> condition. The <c>false</c> passed to
/// <c>HasParentFilter</c> presumably restricts the match to the direct parent — confirm against
/// the HtmlParser documentation.
/// </remarks>
private static void MakeFilters()
{
    // Fan entry: class "clearfix S_line1" under the list element that has both
    // node-type="userListBox" and class="cnfList".
    AndFilter listContainer = new AndFilter(
        new HasAttributeFilter("node-type", "userListBox"),
        new HasAttributeFilter("class", "cnfList"));
    fanFilter = new AndFilter(
        new HasParentFilter(listContainer, false),
        new HasAttributeFilter("class", "clearfix S_line1"));

    // Portrait: class "face mbspace" under an element with class "left".
    portraitFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("class", "left"), false),
        new HasAttributeFilter("class", "face mbspace"));

    // The remaining four filters all look under the element with class "con_left"
    // and differ only in the class of the child they select.
    HasAttributeFilter detailsContainer = new HasAttributeFilter("class", "con_left");

    fanNameFilter = new AndFilter(
        new HasParentFilter(detailsContainer, false),
        new HasAttributeFilter("class", "name"));
    fanConnectFilter = new AndFilter(
        new HasParentFilter(detailsContainer, false),
        new HasAttributeFilter("class", "connect"));
    fanInfoFilter = new AndFilter(
        new HasParentFilter(detailsContainer, false),
        new HasAttributeFilter("class", "info"));
    followMethodFilter = new AndFilter(
        new HasParentFilter(detailsContainer, false),
        new HasAttributeFilter("class", "from W_textb"));
}
/// <summary>
/// Builds the HTML node filters used to locate the entries of the search-result list and their
/// individual fields, and stores them in the static filter fields (<c>poiListFilter</c>,
/// <c>poiFilter</c>, the score filters, <c>commentFilter</c>, <c>nameFilter</c>,
/// <c>addressFilter</c>, <c>tagsFilter</c>).
/// </summary>
/// <remarks>
/// The <c>false</c> passed to <c>HasParentFilter</c> presumably restricts the match to the direct
/// parent — confirm against the HtmlParser documentation.
/// </remarks>
private static void MakeFilters()
{
    // Result list: a DefinitionList node under the element with id "searchList".
    poiListFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("id", "searchList"), false),
        new NodeClassFilter(typeof(DefinitionList)));

    // Single entry inside the list: a DefinitionListBullet node.
    poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

    // Score fields, selected purely by class.
    tasteFilter = new HasAttributeFilter("class", "score1");
    environmentFilter = new HasAttributeFilter("class", "score2");
    serviceFilter = new HasAttributeFilter("class", "score3");
    averageFilter = new HasAttributeFilter("class", "average");

    // Comment element: class "B" together with module "list-readreview".
    commentFilter = new AndFilter(
        new HasAttributeFilter("class", "B"),
        new HasAttributeFilter("module", "list-readreview"));

    // Name: class "BL" under the element with class "shopname".
    nameFilter = new AndFilter(
        new HasParentFilter(new HasAttributeFilter("class", "shopname"), false),
        new HasAttributeFilter("class", "BL"));

    addressFilter = new HasAttributeFilter("class", "address");
    tagsFilter = new HasAttributeFilter("class", "tags");
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns the <c>content</c> attribute of its
/// first meta tag whose <c>name</c> is "keywords".
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The keywords text in lower case (the whole page is lower-cased before parsing so that the
/// attribute value "keywords" matches regardless of its original casing), or an empty string
/// when the page has no such tag or when downloading/parsing fails.
/// </returns>
public static string GetKeywords(string url)
{
    string re = "";
    try
    {
        CookieCollection cookies = new CookieCollection();

        // Lower-case once, culture-independently, so the result does not depend on the
        // machine's current culture.
        string html = new xkHttp().httpGET(url, ref cookies).ToLowerInvariant();

        NodeFilter filter = new HasAttributeFilter("name", "keywords");
        NodeList htmlNodes = new Parser(new Lexer(html)).Parse(filter);

        // Look for the first match that really is a meta tag instead of casting the first node
        // blindly and relying on the catch block when there is no match.
        for (int i = 0; i < htmlNodes.Size(); i++)
        {
            MetaTag meta = htmlNodes[i] as MetaTag;
            if (meta == null)
            {
                continue;
            }

            if (meta.Attributes != null)
            {
                // Attribute names are looked up in upper case.
                object content = meta.Attributes["CONTENT"];
                if (content != null)
                {
                    re = content.ToString();
                }
            }

            break;
        }
    }
    catch
    {
        // Deliberate best effort: any download or parsing failure yields the empty result.
    }

    return re;
}
/// <summary>
/// Analyses the page stored at <c>HtmlContents[index]</c> with Winista.HtmlParser and extracts
/// commodity names and prices. Each record whose price can be located and parsed is added to
/// <c>TradeList</c> with currency "RMB"; every other record increments <c>badRecordCount</c>.
/// </summary>
/// <param name="index">Index of the page inside <c>HtmlContents</c>.</param>
private static void GetInfoFromHtml(int index)
{
    // Set up the HTML analysis objects.
    Lexer lexer = new Lexer(HtmlContents[index]);
    Parser parser = new Parser(lexer);

    // Attribute filters: the two arguments are the attribute name and the required value.
    HasAttributeFilter nameFilter = new HasAttributeFilter("class", "lrg");
    HasAttributeFilter priceFilter = new HasAttributeFilter("class", "bld lrg red");

    // All nodes that satisfy the name filter.
    NodeList nameList = parser.Parse(nameFilter);
    for (int j = 0; j < nameList.Size(); j++)
    {
        // The name is expected to be a span; anything else is counted as a bad record
        // instead of failing the whole page with an InvalidCastException.
        Span nameSpan = nameList[j] as Span;
        if (nameSpan == null)
        {
            badRecordCount++;
            continue;
        }

        // HtmlDecode turns HTML entities into plain text so that Chinese text displays correctly.
        string name = HttpUtility.HtmlDecode(nameSpan.StringText);

        // The price lives under: grandparent of the name node -> second next sibling -> children.
        // Original author's note: Winista.HtmlParser treats whitespace-only text as a node too
        // (IE's developer tools show such "empty text nodes", Chrome does not), and markup like
        // <del>text</del> shows up as three nodes in Children. Each step is null-checked so
        // that unexpected markup produces a bad record rather than a NullReferenceException.
        var priceContainer = nameList[j].Parent;
        if (priceContainer != null)
        {
            priceContainer = priceContainer.Parent;
        }
        if (priceContainer != null)
        {
            priceContainer = priceContainer.NextSibling;
        }
        if (priceContainer != null)
        {
            priceContainer = priceContainer.NextSibling;
        }

        NodeList priceList = null;
        if (priceContainer != null && priceContainer.Children != null)
        {
            // The second argument asks for a recursive search through the child nodes.
            priceList = priceContainer.Children.ExtractAllNodesThatMatch(priceFilter, true);
        }

        Span priceSpan = (priceList != null && priceList.Size() == 1) ? priceList[0] as Span : null;
        string priceStr = priceSpan != null ? priceSpan.StringText : null;

        // The first two characters are skipped (presumably a currency symbol and a separator —
        // confirm against the page markup). Parsing uses the invariant culture so that "12.50"
        // means the same on every machine, and a malformed price becomes a bad record.
        double price;
        if (priceStr != null
            && priceStr.Length > 2
            && double.TryParse(
                priceStr.Substring(2),
                System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                System.Globalization.CultureInfo.InvariantCulture,
                out price))
        {
            TradeList.Add(new Commodity(name, price, "RMB"));
        }
        else
        {
            badRecordCount++;
        }
    }

    Console.WriteLine("第" + (index + 1) + "个页面处理完成!");
}
/// <summary>
/// Initializes the parse filter settings with their defaults:
/// <c>ClassFilter</c> and <c>EnumFilter</c> are attribute filters built from the full type names of
/// <c>ScriptObjectAttribute</c> and <c>ScriptEnumAttribute</c>, and <c>ControllerFilter</c> is a
/// type filter for the two listed MVC controller type names.
/// </summary>
public ParseFilterSettings()
{
    string scriptObjectAttributeName = typeof(ScriptObjectAttribute).FullName;
    string scriptEnumAttributeName = typeof(ScriptEnumAttribute).FullName;

    ClassFilter = new HasAttributeFilter(scriptObjectAttributeName);
    EnumFilter = new HasAttributeFilter(scriptEnumAttributeName);

    // Both the classic ASP.NET MVC and the ASP.NET Core controller type names are listed.
    ControllerFilter = new IsOfAnyTypeFilter(
        "System.Web.Mvc.Controller",
        "Microsoft.AspNetCore.Mvc.Controller");
}
/// <summary>
/// Extracts feed (post) information from a page of the mobile version of Weibo.
/// The markup is taken from the <c>htmlContent</c> member. When <c>user.NickName</c> is empty the
/// user's nickname, remark name and self-introduction are read first; afterwards every node with
/// class "c" that has an ID attribute is converted into a <c>Feed</c> and appended to
/// <paramref name="feedList"/>. Structural problems are reported on the console.
/// </summary>
/// <param name="index">Sequence number of the page being processed; stored in each feed's <c>Page</c>.</param>
/// <param name="feedList">List that receives the extracted feeds.</param>
public void GetInfoFromHtml(int index, List <Feed> feedList)
{
    Lexer lexer = new Lexer(htmlContent);
    Parser parser = new Parser(lexer);
    // Mobile page, personal home page: selects the div that holds the user's name and information.
    HasAttributeFilter userFilter = new HasAttributeFilter("class", "u");
    // Mobile page: the div of every feed carries class="c".
    HasAttributeFilter feedFilter = new HasAttributeFilter("class", "c");
    // Mobile page: in a reposted feed the first child div contains a span with class="cmt".
    HasAttributeFilter refeedFilter = new HasAttributeFilter("class", "cmt");
    // Mobile page: the content of every feed is stored in a span with class="ctt".
    HasAttributeFilter feedContentFilter = new HasAttributeFilter("class", "ctt");
    // Mobile page: the send time and the sending client of every feed are stored in a span with class="ct".
    HasAttributeFilter feedTimeFilter = new HasAttributeFilter("class", "ct");
    // Selects the div that contains a feed's repost reason. Note: the inner HasChildFilter only
    // matches the span containing the text "转发理由:", so a second HasChildFilter is wrapped
    // around it to obtain the div that contains that span.
    HasChildFilter reFeedReasonFilter = new HasChildFilter(new HasChildFilter(new StringFilter("转发理由:")));
    // An empty user.NickName means this home page is crawled for the first time, so the user
    // information has to be read as well.
    if (user.NickName.Equals(""))
    {
        #region 爬取个人主页的微博,首先获得用户信息
        NodeList userNodeList = parser.Parse(userFilter);
        if (userNodeList.Size() == 1)
        {
            NodeList userDetailNodeList = userNodeList[0].Children.ExtractAllNodesThatMatch(feedContentFilter, true);// feedContentFilter is merely reused here because the wanted nodes happen to match it
            if (userDetailNodeList.Size() >= 2)
            {
                // Read the Weibo user name.
                if (userDetailNodeList[0].Children[0].GetType().Equals(typeof(TextNode)))
                {
                    string nickName = ((TextNode)userDetailNodeList[0].Children[0]).ToPlainTextString();
                    // Try to extract the remark name: the text between '(' and ')'.
                    if (nickName.Contains("("))
                    {
                        int start = nickName.IndexOf('(');
                        int end = nickName.IndexOf(')');
                        if (end > start)
                        {
                            string remarkName = nickName.Substring(start + 1, end - start - 1);
                            user.RemarkName = remarkName;
                        }
                        // The nickname is everything before the opening parenthesis.
                        user.NickName = nickName.Substring(0, start);
                    }
                    else
                    {
                        user.NickName = nickName;
                    }
                }
                else
                {
                    Console.WriteLine("获取微博用户名出错!");
                }
                // Read the self-introduction.
                user.SelfIntroduction = ((Span)userDetailNodeList[1]).StringText;
            }
            else
            {
                Console.WriteLine("获取包含微博用户名和自我描述的div出错!");
            }
        }
        else
        {
            Console.WriteLine("获取包含微博用户信息的div出错!");
        }
        // Note: Reset must be called before the parser is used again.
        parser.Reset();
        #endregion
    }
    NodeList feedNodeList = parser.Parse(feedFilter);
    int count = 0;
    for (int i = 0; i < feedNodeList.Size(); i++)
    {
        // Object that stores this feed.
        Feed feed = new Feed();
        feed.Page = index;
        feed.Number = i + 1;
        // Count the feeds (used in the console messages below).
        count++;
        // Take the div of the i-th feed; the node is cast to TagNode so its attributes can be read.
        TagNode feedNode = (TagNode)feedNodeList[i];
        // Note: when an attribute is looked up, the attribute name used as key must be upper-case, e.g. "ID".
        if (feedNode.Attributes.Contains("ID"))// without an ID attribute this node is not a feed
        {
            // Analysis of the mobile page shows (original author's notes):
            // one child div of each feed div generally contains the feed content;
            // the second child div contains pictures, the send time, etc.;
            // a reposted feed has a third child div with the repost reason, repost source, time, etc.
            // First child div.
            // NOTE(review): assumes the feed node has at least one child and that it is a tag — confirm.
            TagNode feedFirstDiv = (TagNode)feedNode.Children[0];
            // Find the marker of a reposted feed.
            NodeList reFeedList = feedFirstDiv.Children.ExtractAllNodesThatMatch(refeedFilter, true);
            if (reFeedList.Size() > 0) // in practice class="cmt" is not used by reposted feeds only
            {
                // NOTE(review): Substring(0, 2) throws when the decoded text has fewer than two characters.
                if (HttpUtility.HtmlDecode(((TextNode)reFeedList[0].Children[0]).ToPlainTextString()).Substring(0, 2).Equals("转发")) // extra check to make sure this really is the source of a reposted feed
                {
                    feed.ReFeedOrNot = true;
                    feed.OriginalAuthor = HttpUtility.HtmlDecode(((ATag)reFeedList[0].Children[1]).StringText);
                    // Find the child div that contains the repost reason.
                    NodeList reFeedReasonList = feedNode.Children.ExtractAllNodesThatMatch(reFeedReasonFilter, true);
                    if (reFeedReasonList.Size() == 1)
                    {
                        TagNode reFeedReasonDiv = (TagNode)reFeedReasonList[0];
                        // In the div containing the repost reason the first child is always the span
                        // with the text "转发理由". The children from the second one onwards make up
                        // the repost reason; they can be text or links (@someone).
                        // The reason ends when either a text node ends with "//", or a link node
                        // has the text "赞[X]" (or links to "http://weibo.cn/attitude/...").
                        for (int j = 1; j < reFeedReasonDiv.Children.Size(); j++)
                        {
                            Type t = reFeedReasonDiv.Children[j].GetType();
                            if (t.Equals(typeof(TextNode)))
                            {
                                string str = HttpUtility.HtmlDecode(((TextNode)reFeedReasonDiv.Children[j]).ToPlainTextString());
                                if (str.Length >= 2 && str.Substring(str.Length - 2, 2).Equals("//"))
                                {
                                    // Text ending in "//": the reason ends here and the next node
                                    // is taken as the link to the previous reposter.
                                    feed.ReFeedReason += str.Substring(0, str.Length - 2);
                                    feed.ReFeedFrom = HttpUtility.HtmlDecode(((ATag)reFeedReasonDiv.Children[j + 1]).StringText);
                                    if (feed.ReFeedFrom.Substring(0, 1).Equals("@"))// strip the '@' in front of the previous reposter
                                    {
                                        feed.ReFeedFrom = feed.ReFeedFrom.Substring(1);
                                    }
                                    break;
                                }
                                else
                                {
                                    feed.ReFeedReason += str;
                                }
                                continue;
                            }
                            if (t.Equals(typeof(ATag)))
                            {
                                string str = HttpUtility.HtmlDecode(((ATag)reFeedReasonDiv.Children[j]).StringText);
                                if (str.Substring(0, 1).Equals("赞"))
                                {
                                    // Reaching the "赞" link without a "//" marker: the feed was
                                    // reposted directly from the original author.
                                    feed.ReFeedFrom = feed.OriginalAuthor;
                                    break;
                                }
                                else
                                {
                                    feed.ReFeedReason += str;
                                }
                                continue;
                            }
                        }
                    }
                    else
                    {
                        Console.WriteLine("好像找到不止一个转发理由?!");
                    }
                }
                else
                {
                    Console.WriteLine("糟糕!第" + count + "条微博中,找不到转发微博的来源!");
                }
            }
            // Find the marker that contains the feed text.
            NodeList feedContentList = feedFirstDiv.Children.ExtractAllNodesThatMatch(feedContentFilter, true);
            switch (feedContentList.Size())
            {
                case 1:
                    // The feed text is contained in a single span.
                    Span feedContentListNode = (Span)feedContentList[0];
                    // The feed text is an arbitrary mix of text and links (e.g. @someone), so each
                    // child of the span is handled according to its type (text node or link node).
                    for (int j = 0; j < feedContentListNode.Children.Size(); j++)
                    {
                        Type t = feedContentListNode.Children[j].GetType();
                        if (t.Equals(typeof(TextNode)))
                        {
                            feed.Content += HttpUtility.HtmlDecode(((TextNode)feedContentListNode.Children[j]).ToPlainTextString());
                            continue;
                        }
                        if (t.Equals(typeof(ATag)))
                        {
                            feed.Content += HttpUtility.HtmlDecode(((ATag)feedContentListNode.Children[j]).StringText);
                            continue;
                        }
                    }
                    break;
                default:
                    Console.WriteLine("糟糕!第" + count + "条微博中,取得微博正文的判断标准出错了!");
                    break;
            }
            // Within the whole feed, find the marker that contains the send time.
            NodeList feedTimeList = feedNode.Children.ExtractAllNodesThatMatch(feedTimeFilter, true);
            switch (feedTimeList.Size())
            {
                case 1:
                    string time = HttpUtility.HtmlDecode(((TextNode)((Span)feedTimeList[0]).Children[0]).ToHtml());
                    feed.Time = Program.GetTime(time);
                    // A second child, when present, is read as the link naming the sending client.
                    if (feedTimeList[0].Children.Size() > 1)
                    {
                        feed.Device = HttpUtility.HtmlDecode(((ATag)((Span)feedTimeList[0]).Children[1]).StringText);
                    }
                    // Walking backwards from the time marker reaches the "赞" (like), "转发" (repost)
                    // and "评论" (comment) links.
                    // NOTE(review): relies on fixed sibling positions (steps 5, 7 and 9 before the
                    // time span) and on PreviousSibling never being null — confirm against the markup.
                    INode node = feedTimeList[0];
                    for (int j = 0; j < 9; j++)
                    {
                        node = node.PreviousSibling;
                        switch (j)
                        {
                            case 4:
                                // Comment count: drops the first three characters and the last one
                                // (presumably a label such as "评论[12]" — confirm).
                                string strCommentCount = ((ATag)node).StringText;
                                feed.CommentCount = Int32.Parse(strCommentCount.Substring(3, strCommentCount.Length - 4));
                                break;
                            case 6:
                                // Repost count: same layout as the comment count.
                                string strReFeedCount = ((ATag)node).StringText;
                                feed.ReFeedCount = Int32.Parse(strReFeedCount.Substring(3, strReFeedCount.Length - 4));
                                break;
                            case 8:
                                // Like count: drops the first two characters and the last one
                                // (presumably a label such as "赞[12]" — confirm).
                                string strLikeCount = ((ATag)node).StringText;
                                feed.LikeCount = Int32.Parse(strLikeCount.Substring(2, strLikeCount.Length - 3));
                                break;
                            default:
                                break;
                        }
                    }
                    break;
                default:
                    Console.WriteLine("糟糕!第" + count + "条微博中,取得微博时间的判断标准出错了!");
                    break;
            }
            feedList.Add(feed);
        }
    }
}
/// <summary>
/// Initializes the parse filter settings with their defaults:
/// <c>ClassFilter</c> and <c>EnumFilter</c> are attribute filters built from the full type names of
/// <c>ScriptObjectAttribute</c> and <c>ScriptEnumAttribute</c>, and <c>ControllerFilter</c> is a
/// type filter for <c>MvcConstants.ControllerBaseFullName_AspNetCore</c>.
/// </summary>
public ParseFilterSettings()
{
    string scriptObjectAttributeName = typeof(ScriptObjectAttribute).FullName;
    string scriptEnumAttributeName = typeof(ScriptEnumAttribute).FullName;

    ClassFilter = new HasAttributeFilter(scriptObjectAttributeName);
    EnumFilter = new HasAttributeFilter(scriptEnumAttributeName);

    // Only the ASP.NET Core controller base type name is used here.
    ControllerFilter = new IsOfAnyTypeFilter(MvcConstants.ControllerBaseFullName_AspNetCore);
}