/// <summary>
/// Reads the local catalogue page "catalogue.htm", downloads every chapter page it links to,
/// and writes the paragraph text of each chapter to "革命逸事.txt".
/// </summary>
/// <remarks>
/// Chapter links are anchors with class "fontStyle2 colorStyleLink" located under an element
/// with class "row zhangjieUl". Each chapter is expected to contain exactly one element with
/// class "textP fontStyle2 colorStyleText"; when that is not the case a diagnostic is printed
/// and the chapter body is skipped (its heading is still written).
/// All readers, responses and the output writer are disposed deterministically, including
/// when the catalogue contains no links or a download fails part-way through.
/// </remarks>
static void GetStoryOfRevolution()
{
    // Load the catalogue fully into memory so the file handle can be released immediately.
    string catalogueHtml;
    using (StreamReader reader = new StreamReader("catalogue.htm"))
    {
        catalogueHtml = reader.ReadToEnd();
    }

    Parser parser = new Parser(new Lexer(catalogueHtml));
    HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
    HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
    AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
    NodeList linkNodeList = parser.Parse(linkFilter);

    // Download every chapter page first; the HTML is kept in catalogue order.
    List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
    for (int i = 0; i < linkNodeList.Size(); i++)
    {
        ATag linkNode = (ATag)linkNodeList[i];
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkNode.Link);

        // Dispose the response and reader per chapter instead of only closing the last reader.
        using (WebResponse response = httpWebRequest.GetResponse())
        using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
        {
            chapterHtmlContentList.Add(chapterReader.ReadToEnd());
        }

        Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
    }

    HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
    using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
    {
        for (int i = 0; i < chapterHtmlContentList.Count; i++)
        {
            writer.WriteLine("第" + (i + 1) + "章");

            Parser chapterParser = new Parser(new Lexer(chapterHtmlContentList[i]));
            NodeList praghNodeList = chapterParser.Parse(praghFilter);
            if (praghNodeList.Size() == 1)
            {
                NodeList children = praghNodeList[0].Children;

                // Guard against a container without a child list; it simply contributes no paragraphs.
                int childCount = children == null ? 0 : children.Size();
                for (int j = 0; j < childCount; j++)
                {
                    // Exact type match on purpose: subclasses of ParagraphTag are not accepted.
                    if (children[j].GetType().Equals(typeof(ParagraphTag)))
                    {
                        ParagraphTag praghTag = (ParagraphTag)children[j];
                        writer.WriteLine(" " + praghTag.StringText);
                    }
                }
                writer.WriteLine();
            }
            else
            {
                Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
            }
        }
    }
}
/// <summary>
/// Requests the reference page for <paramref name="paper_id"/> and extracts the plain text of
/// every tag found directly under the <c>div</c> whose id is "export_container".
/// </summary>
/// <param name="paper_id">Identifier list forwarded unchanged to <c>_HttpUtil.getPaperReferenceHTML</c>.</param>
/// <returns>
/// A list of strings, one per child tag, with CR/LF replaced by spaces and everything up to and
/// including the first ']' removed; null when the page is null/empty or no container is found.
/// </returns>
protected ArrayList getPaperReferenceByID(ArrayList paper_id)
{
    string page = _HttpUtil.getPaperReferenceHTML(paper_id);
    if (string.IsNullOrEmpty(page))
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(page));
    TagNameFilter divFilter = new TagNameFilter("div");
    HasAttributeFilter containerIdFilter = new HasAttributeFilter("id", "export_container");
    NodeList containers = parser.ExtractAllNodesThatMatch(new AndFilter(divFilter, containerIdFilter));
    if (containers == null || containers.Count < 1)
    {
        return null;
    }

    // Only the first matching container is inspected.
    NodeList entries = containers[0].Children;
    ArrayList references = new ArrayList();
    int index = 0;
    while (index < entries.Count)
    {
        ITag entry = entries[index] as ITag;
        index++;
        if (entry == null)
        {
            // Non-tag children (e.g. text nodes) are skipped.
            continue;
        }

        string flattened = entry.ToPlainTextString().Replace('\r', ' ').Replace('\n', ' ');

        // IndexOf returns -1 when there is no ']', so the whole string is kept in that case.
        int afterBracket = flattened.IndexOf(']') + 1;
        references.Add(flattened.Substring(afterBracket));
    }

    // The completion callback is only reached on the success path.
    if (_Progressable != null)
    {
        _Progressable.onFinish(references);
    }

    return references;
}
/// <summary>
/// Searches for a paper by name and derives its id from the href of the first matching link.
/// </summary>
/// <param name="paper_name">Paper title; passed to the search request and to <c>PaperFilter</c>.</param>
/// <returns>
/// The last '/'-separated segment of the href, truncated at its first '.'; null when the page is
/// null/empty, no link matches, the first match is not a tag, or its href is null/empty.
/// </returns>
protected string getPaperID(string paper_name)
{
    string page = _HttpUtil.getPaperIDHTML(paper_name);
    if (string.IsNullOrEmpty(page))
    {
        return null;
    }

    Parser parser = new Parser(new Lexer(page));

    // An <A target="_blank"> element that has a child accepted by PaperFilter.
    TagNameFilter anchorFilter = new TagNameFilter("A");
    HasAttributeFilter blankTargetFilter = new HasAttributeFilter("target", "_blank");
    HasChildFilter titleChildFilter = new HasChildFilter(new PaperFilter(paper_name));
    AndFilter blankAnchorFilter = new AndFilter(anchorFilter, blankTargetFilter);
    AndFilter paperLinkFilter = new AndFilter(blankAnchorFilter, titleChildFilter);

    NodeList matches = parser.ExtractAllNodesThatMatch(paperLinkFilter);
    if (matches == null || matches.Count < 1)
    {
        // Paper not found.
        return null;
    }

    // TODO: several papers may match; only the first one is used.
    ITag link = matches[0] as ITag;
    if (link == null)
    {
        return null;
    }

    string href = link.GetAttribute("href");
    if (string.IsNullOrEmpty(href))
    {
        return null;
    }

    string[] pathSegments = href.Split(new char[] { '/' });
    string lastSegment = pathSegments[pathSegments.Length - 1];
    string[] nameParts = lastSegment.Split(new char[] { '.' });
    return nameParts[0];
}
/// <summary>
/// Builds the HTML node filters used to pick fan-list entries and their fields
/// (portrait, name, connection, info, follow method) and stores them in the static filter fields.
/// </summary>
/// <remarks>
/// Every <c>HasParentFilter</c> here is created with <c>false</c> as its second argument —
/// presumably "direct parent only"; confirm against the parser library.
/// </remarks>
private static void MakeFilters()
{
    // Fan list container: node-type="userListBox" combined with class="cnfList".
    HasAttributeFilter listClass = new HasAttributeFilter("class", "cnfList");
    HasAttributeFilter listNodeType = new HasAttributeFilter("node-type", "userListBox");
    AndFilter listContainer = new AndFilter(listNodeType, listClass);

    // One fan entry: class="clearfix S_line1" under the list container.
    HasParentFilter underList = new HasParentFilter(listContainer, false);
    HasAttributeFilter entryClass = new HasAttributeFilter("class", "clearfix S_line1");
    fanFilter = new AndFilter(underList, entryClass);

    // Portrait: class="face mbspace" under class="left".
    HasAttributeFilter leftColumn = new HasAttributeFilter("class", "left");
    HasParentFilter underLeftColumn = new HasParentFilter(leftColumn, false);
    HasAttributeFilter portraitClass = new HasAttributeFilter("class", "face mbspace");
    portraitFilter = new AndFilter(underLeftColumn, portraitClass);

    // The remaining fields all sit under class="con_left"; each gets its own parent filter instance.
    HasAttributeFilter contentColumn = new HasAttributeFilter("class", "con_left");

    HasParentFilter nameParent = new HasParentFilter(contentColumn, false);
    fanNameFilter = new AndFilter(nameParent, new HasAttributeFilter("class", "name"));

    HasParentFilter connectParent = new HasParentFilter(contentColumn, false);
    fanConnectFilter = new AndFilter(connectParent, new HasAttributeFilter("class", "connect"));

    HasParentFilter infoParent = new HasParentFilter(contentColumn, false);
    fanInfoFilter = new AndFilter(infoParent, new HasAttributeFilter("class", "info"));

    HasParentFilter followParent = new HasParentFilter(contentColumn, false);
    followMethodFilter = new AndFilter(followParent, new HasAttributeFilter("class", "from W_textb"));
}
/// <summary>
/// Collects every mail link on the page.
/// </summary>
/// <returns>
/// The nodes returned by <c>Parse</c> for a filter that requires both a <c>LinkTag</c> node
/// and acceptance by <c>MailLinkFilter</c>.
/// </returns>
public virtual NodeList GetAllMailLinks()
{
    NodeClassFilter linkTagsOnly = new NodeClassFilter(typeof(LinkTag));
    MailLinkFilter mailTargetsOnly = new MailLinkFilter();
    INodeFilter mailLinks = new AndFilter(linkTagsOnly, mailTargetsOnly);
    return Parse(mailLinks);
}
/// <summary>
/// Builds the HTML node filters for the search-result (POI) page and stores them in the
/// static filter fields: list, entry, scores, average, comment, name, address and tags.
/// </summary>
private static void MakeFilters()
{
    // POI list: a DefinitionList node under the element with id="searchList".
    NodeClassFilter definitionLists = new NodeClassFilter(typeof(DefinitionList));
    HasAttributeFilter searchListId = new HasAttributeFilter("id", "searchList");
    HasParentFilter underSearchList = new HasParentFilter(searchListId, false);
    poiListFilter = new AndFilter(underSearchList, definitionLists);

    // A single POI entry inside the list.
    poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));

    // Score and average columns are identified by class alone.
    tasteFilter = new HasAttributeFilter("class", "score1");
    environmentFilter = new HasAttributeFilter("class", "score2");
    serviceFilter = new HasAttributeFilter("class", "score3");
    averageFilter = new HasAttributeFilter("class", "average");

    // Comment link: class="B" together with module="list-readreview".
    HasAttributeFilter commentClass = new HasAttributeFilter("class", "B");
    HasAttributeFilter commentModule = new HasAttributeFilter("module", "list-readreview");
    commentFilter = new AndFilter(commentClass, commentModule);

    // Shop name: class="BL" under class="shopname".
    HasAttributeFilter shopNameContainer = new HasAttributeFilter("class", "shopname");
    HasParentFilter underShopName = new HasParentFilter(shopNameContainer, false);
    nameFilter = new AndFilter(underShopName, new HasAttributeFilter("class", "BL"));

    addressFilter = new HasAttributeFilter("class", "address");
    tagsFilter = new HasAttributeFilter("class", "tags");
}
/// <summary>
/// Downloads the card list page from <c>Config.Uri</c>, parses its table rows into
/// <c>Card</c> objects, saves them as XML to <c>Config.CardsXml</c> and downloads each card image.
/// </summary>
/// <param name="notifier">
/// Optional progress sink; may be null. It is called with 0.0f for the two preparation steps and
/// with <c>1.0f / m_Cards.Count</c> before each image download.
/// </param>
/// <remarks>
/// Rows are <c>TableRow</c> nodes with class "even" or "odd". Non-text child nodes are read
/// positionally: 0 = id, 1 = name, 2 = long info, 3 = mana cost (may be empty), 4 = rarity.
/// The <c>WebClient</c> and the output <c>FileStream</c> are disposed even when an exception is thrown.
/// </remarks>
public void GetFromWeb(IGetFromWebNotify notifier)
{
    Directory.CreateDirectory(Config.ImagePath);
    if (notifier != null)
    {
        notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
    }

    // One client is reused for the page and all image downloads, then disposed.
    using (WebClient webClient = new WebClient())
    {
        webClient.Encoding = Encoding.UTF8;
        String strHtml = webClient.DownloadString(Config.Uri);
        if (notifier != null)
        {
            notifier.Notity("����html�ĵ�", 0.0f);
        }

        Lexer lexer = new Lexer(strHtml);
        Parser parser = new Parser(lexer);
        AndFilter andFilter = new AndFilter(
            new NodeClassFilter(typeof(TableRow)),
            new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
        NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);

        // NOTE(review): lock (this) is kept because other members may synchronise on the same
        // monitor; a private lock object would be preferable if all users can be changed together.
        lock (this)
        {
            m_Cards = new List<Card>();
            foreach (INode node in htmlNodes.ToNodeArray())
            {
                int iFiledIndex = 0;
                Card card = new Card();
                foreach (INode subNode in node.Children.ToNodeArray())
                {
                    // Text nodes between cells do not count as columns.
                    if (subNode is TextNode)
                    {
                        continue;
                    }

                    switch (iFiledIndex)
                    {
                        case 0:
                            card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                            card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                            break;
                        case 1:
                            card.Name = subNode.FirstChild.FirstChild.GetText();
                            break;
                        case 2:
                            StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                            break;
                        case 3:
                            // The mana-cost cell may have no child at all.
                            if (subNode.FirstChild != null)
                            {
                                card.ManaCost = subNode.FirstChild.GetText();
                            }
                            else
                            {
                                card.ManaCost = String.Empty;
                            }
                            break;
                        case 4:
                            card.Rare = subNode.FirstChild.GetText();
                            break;
                    }
                    iFiledIndex++;
                }
                m_Cards.Add(card);
            }
        }

        XmlSerializer s = new XmlSerializer(typeof(List<Card>));

        // NOTE(review): FileMode.CreateNew throws IOException when the file already exists —
        // confirm that refusing to overwrite is intended.
        using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
        {
            s.Serialize(fstream, m_Cards);
        }

        foreach (Card card in m_Cards)
        {
            if (notifier != null)
            {
                notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
            }
            webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
        }
    }
}
/// <summary>
/// Given the root node of a weibo post, returns the description of the device/application
/// the post was sent from.
/// </summary>
/// <param name="i">Sequence number of the post on its page; used only in the diagnostic message.</param>
/// <param name="feedDiv">The div node that contains the post.</param>
/// <returns>
/// The send-type text with leading and trailing spaces, carriage returns and line feeds removed;
/// an empty string when no send-type information could be found.
/// </returns>
private string GetFeedSendTypeInfo(int i, INode feedDiv)
{
    string result = "";
    NodeList feedSendTypeNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedSendTypeFilter, true);
    if (feedSendTypeNodeList.Size() == 1)
    {
        result = ((ATag)(feedSendTypeNodeList[0])).StringText;
    }
    else
    {
        // In some cases the page shows plain text such as "from an unreviewed application"
        // instead of a link, so fall back to the text nodes under the "from" container.
        AndFilter fromFilter = new AndFilter(new HasParentFilter(feedFromFilter, true), new NodeClassFilter(typeof(TextNode)));
        NodeList textNodeList = feedDiv.Children.ExtractAllNodesThatMatch(fromFilter, true);
        for (int j = 0; j < textNodeList.Size(); j++)
        {
            if (textNodeList[j].ToPlainTextString().Equals("来自"))
            {
                // The device text is the node right after "来自". Check that such a node exists:
                // the marker can be the last text node, in which case there is nothing to read.
                if (j + 1 < textNodeList.Size())
                {
                    result = textNodeList[j + 1].ToPlainTextString();
                }
                break;
            }
        }

        if (result.Equals(""))
        {
            Console.WriteLine("第" + i + "条微博中,微博发送设备为空");
        }
    }

    // Strip surrounding spaces and line breaks from both ends.
    char[] shouldRemove = { ' ', '\r', '\n' };
    return result.Trim(shouldRemove);
}
/// <summary>
/// Configures the HTML node filters used to pick apart a weibo feed page and stores them
/// in the static filter fields.
/// </summary>
private static void MakeFilters()
{
    // Selects the div of each individual post.
    feedFilter = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix");

    // When crawling a personal home page these filters yield the divs that carry the mid
    // attribute (mid is related to maid and endid).
    idFilter = new List<HasAttributeFilter>
    {
        new HasAttributeFilter("class", "WB_feed_type SW_fun "),
        feedFilter
    };

    // Post author div. A forwarded post's div also has class="WB_info", so the parent is
    // checked as well to make the match reliable.
    HasAttributeFilter detailDiv = new HasAttributeFilter("class", "WB_detail");
    HasAttributeFilter infoClass = new HasAttributeFilter("class", "WB_info");
    HasParentFilter underDetail = new HasParentFilter(detailDiv, false);
    feedAuthorFilter = new AndFilter(infoClass, underDetail);

    // Post content div. A forwarded post's div also has class="WB_text", hence two conditions.
    HasAttributeFilter textClass = new HasAttributeFilter("class", "WB_text");
    HasAttributeFilter contentNodeType = new HasAttributeFilter("node-type", "feed_list_content");
    feedContentFilter = new AndFilter(textClass, contentNodeType);

    // Div that contains a forwarded post.
    reFeedFilter = new HasAttributeFilter("node-type", "feed_list_forwardContent");

    // Original author of a forwarded post; two conditions for the same reason as above.
    HasAttributeFilter forwardedInfoClass = new HasAttributeFilter("class", "WB_info");
    HasParentFilter insideForward = new HasParentFilter(reFeedFilter, true);
    reFeedAuthorFilter = new AndFilter(forwardedInfoClass, insideForward);

    // Content of a forwarded post; two conditions for the same reason as above.
    HasAttributeFilter forwardedTextClass = new HasAttributeFilter("class", "WB_text");
    HasAttributeFilter reasonNodeType = new HasAttributeFilter("node-type", "feed_list_reason");
    reFeedContentFilter = new AndFilter(forwardedTextClass, reasonNodeType);

    // Forwarded post that has been deleted (when that div sits under the div matched by reFeedFilter).
    refeedDeletedFilter1 = new HasAttributeFilter("class", "WB_deltxt");

    // Forwarded post that has been deleted (when that div sits under <div class="WB_datail">).
    HasParentFilter insideDetail = new HasParentFilter(detailDiv, true);
    refeedDeletedFilter2 = new AndFilter(insideDetail, refeedDeletedFilter1);

    // The <b> tag holding the forward count of the original post.
    HasAttributeFilter highlightClass = new HasAttributeFilter("class", "S_spetxt");
    HasAttributeFilter followNumNodeType = new HasAttributeFilter("node-type", "followNum");
    similarFeedCountFilter = new AndFilter(highlightClass, followNumNodeType);

    // Tag holding similar forwards of the original post.
    HasAttributeFilter noLineFeedDiv = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix WB_feed_noLine");
    HasParentFilter underNoLineFeed = new HasParentFilter(noLineFeedDiv, false);
    similarFeedFilter = new AndFilter(detailDiv, underNoLineFeed);

    // Div holding the location the post was sent from.
    HasAttributeFilter mapDataClass = new HasAttributeFilter("class", "map_data");
    HasParentFilter locationParent = new HasParentFilter(detailDiv, false);
    feedLocationFilter = new AndFilter(mapDataClass, locationParent);

    // Div holding send time, send method, forward count and comment count; anything under
    // the expanded-media block is excluded.
    HasAttributeFilter expandedMedia = new HasAttributeFilter("class", "WB_media_expand SW_fun2 S_line1 S_bg1");
    NotFilter outsideExpandedMedia = new NotFilter(new HasParentFilter(expandedMedia, true));
    HasAttributeFilter funcClass = new HasAttributeFilter("class", "WB_func clearfix");
    AndFilter metaDataDiv = new AndFilter(outsideExpandedMedia, funcClass);

    // Div holding the forward and comment counts.
    HasParentFilter handleParent = new HasParentFilter(metaDataDiv, false);
    AndFilter handleDiv = new AndFilter(handleParent, new HasAttributeFilter("class", "WB_handle"));

    // Div holding the send time and send method.
    HasParentFilter fromParent = new HasParentFilter(metaDataDiv, false);
    feedFromFilter = new AndFilter(fromParent, new HasAttributeFilter("class", "WB_from"));

    // Link tag holding the "like" count.
    HasParentFilter likeParent = new HasParentFilter(handleDiv, false);
    feedLikeFilter = new AndFilter(likeParent, new HasAttributeFilter("action-type", "fl_like"));

    // Link tag holding the forward count.
    HasParentFilter forwardParent = new HasParentFilter(handleDiv, false);
    feedForwardFilter = new AndFilter(forwardParent, new HasAttributeFilter("action-type", "fl_forward"));

    // Link tag holding the comment count.
    HasParentFilter commentParent = new HasParentFilter(handleDiv, false);
    feedCommentFilter = new AndFilter(commentParent, new HasAttributeFilter("action-type", "fl_comment"));

    // Link tag holding the send time.
    HasParentFilter timeParent = new HasParentFilter(feedFromFilter, false);
    feedTimeFilter = new AndFilter(timeParent, new HasAttributeFilter("class", "S_link2 WB_time"));

    // Link tag holding the send method.
    HasParentFilter sendTypeParent = new HasParentFilter(feedFromFilter, false);
    feedSendTypeFilter = new AndFilter(sendTypeParent, new HasAttributeFilter("class", "S_link2"));
}
/// <summary>
/// Fetches the page at <paramref name="url"/>, finds every node that both matches the Taobao
/// item-URL pattern and has class "EventCanSelect", and appends those nodes to <c>ItemLink</c>.
/// </summary>
/// <param name="url">Address handed to <c>GetHtml</c> to obtain the page markup.</param>
public void GetLinkForPage(string url)
{
    Parser pageParser = new Parser(new Lexer(GetHtml(url)));
    pageParser.Encoding = "gb2312";

    NodeFilter itemUrlFilter = new LinkRegexFilter(@"^http\://item\.taobao\.com/item\.htm\?id\=\d+$");
    NodeFilter selectableFilter = new HasAttributeFilter("class", "EventCanSelect");
    AndFilter itemLinkFilter = new AndFilter(itemUrlFilter, selectableFilter);

    NodeList matches = pageParser.Parse(itemLinkFilter);

    // The count is read once up front, before anything is appended.
    int total = matches.Count;
    int index = 0;
    while (index < total)
    {
        ItemLink.Add(matches[index]);
        index++;
    }
}
/// <summary>
/// Parses <paramref name="html"/> and returns the nodes that form the detail section of an item page.
/// </summary>
/// <param name="html">Markup handed to <c>GetParser</c>.</param>
/// <returns>
/// Nodes matching either id="detail" with class="box", or id="J_DivItemDesc" with class="content".
/// </returns>
public NodeList GetDetailPageForHtml(string html)
{
    Parser detailParser = GetParser(html);

    // Variant 1: <... id="detail" class="box">
    NodeFilter detailId = new HasAttributeFilter("id", "detail");
    NodeFilter boxClass = new HasAttributeFilter("class", "box");
    AndFilter detailBox = new AndFilter(detailId, boxClass);

    // Variant 2: <... id="J_DivItemDesc" class="content">
    NodeFilter descriptionId = new HasAttributeFilter("id", "J_DivItemDesc");
    NodeFilter contentClass = new HasAttributeFilter("class", "content");
    AndFilter descriptionContent = new AndFilter(descriptionId, contentClass);

    OrFilter eitherVariant = new OrFilter(detailBox, descriptionContent);
    return detailParser.Parse(eitherVariant);
}