Example #1
 static void GetStoryOfRevolution()
 {
     // Read the locally saved catalogue page and collect the chapter links.
     StreamReader reader = new StreamReader("catalogue.htm");
     Lexer lexer = new Lexer(reader.ReadToEnd());
     reader.Close();
     Parser parser = new Parser(lexer);
     HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
     HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
     AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
     NodeList linkNodeList = parser.Parse(linkFilter);
     List<string> linkUrlList = new List<string>(linkNodeList.Size());
     List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
     for (int i = 0; i < linkNodeList.Size(); i++)
     {
         ATag linkNode = (ATag)linkNodeList[i];
         linkUrlList.Add(linkNode.Link);
         // Download each chapter page and close the reader before moving on,
         // so the underlying response stream is released per request.
         HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkUrlList[linkUrlList.Count - 1]);
         StreamReader chapterReader = new StreamReader(new BufferedStream(httpWebRequest.GetResponse().GetResponseStream(), 4 * 200 * 1024));
         chapterHtmlContentList.Add(chapterReader.ReadToEnd());
         chapterReader.Close();
         Console.WriteLine("Page " + (i + 1) + " fetched.");
     }
     // Pull the paragraph container out of each chapter and write its text to a file.
     HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
     StreamWriter writer = new StreamWriter("革命逸事.txt");
     for (int i = 0; i < chapterHtmlContentList.Count; i++)
     {
         writer.WriteLine("第" + (i + 1) + "章"); // "Chapter N" heading written into the output file
         lexer = new Lexer(chapterHtmlContentList[i]);
         parser = new Parser(lexer);
         NodeList praghNodeList = parser.Parse(praghFilter);
         if (praghNodeList.Size() == 1)
         {
             for (int j = 0; j < praghNodeList[0].Children.Size(); j++)
             {
                 if (praghNodeList[0].Children[j].GetType().Equals(typeof(ParagraphTag)))
                 {
                     ParagraphTag praghTag = (ParagraphTag)praghNodeList[0].Children[j];
                     writer.WriteLine("    " + praghTag.StringText);
                 }
             }
             writer.WriteLine();
         }
         else
         {
             Console.WriteLine("Page " + (i + 1) + ": the paragraph filter did not match exactly one container.");
         }
     }
     writer.Close();
 }
        protected ArrayList getPaperReferenceByID(ArrayList paper_id)
        {
            string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);

            if (html_page == null || html_page == "")
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("div");
            HasAttributeFilter attr_f = new HasAttributeFilter("id", "export_container");

            AndFilter af = new AndFilter(tag_f, attr_f);

            NodeList childs = p.ExtractAllNodesThatMatch(af);

            if (childs == null || childs.Count <= 0)
            {
                return null;
            }

            INode node = childs[0];

            NodeList ref_childs = node.Children;
            ArrayList ref_list = new ArrayList();

            for (int i = 0; i < ref_childs.Count; ++i)
            {
                INode tmp = ref_childs[i];

                if (tmp is ITag)
                {
                    ITag tag = tmp as ITag;

                    string str = tag.ToPlainTextString();

                    str = str.Replace('\r', ' ').Replace('\n',' ');

                    // Drop the leading "[n]" reference index.
                    str = str.Substring(str.IndexOf(']') + 1);

                    //str = System.Text.RegularExpressions.Regex.Replace(str, @"^\[*\]$", "");

                    ref_list.Add(str);
                }
            }

            if (_Progressable != null)
            {
                _Progressable.onFinish(ref_list);
            }

            return ref_list;
        }
        protected string getPaperID(string paper_name)
        {
            string html_page = _HttpUtil.getPaperIDHTML(paper_name);

            if (html_page == null || html_page == "")
            {
                return null;
            }

            Parser p = new Parser(new Lexer(html_page));

            TagNameFilter tag_f = new TagNameFilter("A");
            HasAttributeFilter attr_f = new HasAttributeFilter("target", "_blank");
            HasChildFilter child_f = new HasChildFilter(new PaperFilter(paper_name));

            AndFilter af = new AndFilter(tag_f,attr_f);
            AndFilter aff = new AndFilter(af, child_f);

            NodeList childs = p.ExtractAllNodesThatMatch(aff);

            if (childs == null || childs.Count <= 0)
            {
                //Paper not found
                return null;
            }
            //TODO Multi Paper found

            INode node = childs[0];
            if (node is ITag)
            {
                ITag t = node as ITag;

                string href = t.GetAttribute("href");

                if (href != null && href != "")
                {
                    string [] sp = href.Split(new char[]{'/'});

                    return sp[sp.Length - 1].Split(new char[]{'.'})[0];
                }
            }

            //Not Found
            return null;
        }
 /// <summary>
 /// Configures the various HTML node filters
 /// </summary>
 private static void MakeFilters()
 {
     HasAttributeFilter fansListFilterByClass = new HasAttributeFilter("class", "cnfList");
     HasAttributeFilter fanListFilterByNodeType = new HasAttributeFilter("node-type", "userListBox");
     AndFilter fansListFilter = new AndFilter(fanListFilterByNodeType, fansListFilterByClass);
     fanFilter = new AndFilter(new HasParentFilter(fansListFilter, false), new HasAttributeFilter("class", "clearfix S_line1"));
     HasAttributeFilter portraitFilterByParent = new HasAttributeFilter("class", "left");
     portraitFilter = new AndFilter(new HasParentFilter(portraitFilterByParent, false), new HasAttributeFilter("class", "face mbspace"));
     HasAttributeFilter fanNameFilterByParent = new HasAttributeFilter("class", "con_left");
     fanNameFilter = new AndFilter(new HasParentFilter(fanNameFilterByParent, false), new HasAttributeFilter("class", "name"));
     fanConnectFilter = new AndFilter(new HasParentFilter(fanNameFilterByParent, false), new HasAttributeFilter("class", "connect"));
     fanInfoFilter = new AndFilter(new HasParentFilter(fanNameFilterByParent, false), new HasAttributeFilter("class", "info"));
     followMethodFilter = new AndFilter(new HasParentFilter(fanNameFilterByParent, false), new HasAttributeFilter("class", "from W_textb"));
 }
		/// <summary>
		/// Gets all mail links in the page
		/// </summary>
		/// <returns></returns>
		public virtual NodeList GetAllMailLinks()
		{
			INodeFilter filter = new AndFilter(new NodeClassFilter(typeof(LinkTag)), new MailLinkFilter());
			return this.Parse(filter);
		}
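A short caller-side sketch of the method above. The owning class is not shown in the example, so "page" is a hypothetical variable standing in for whatever object exposes GetAllMailLinks(); the rest uses only NodeList and ITag members that appear elsewhere on this page.

 // Hypothetical usage (names assumed): walk the matched mail links and print each href.
 NodeList mailLinks = page.GetAllMailLinks();
 for (int i = 0; i < mailLinks.Size(); i++)
 {
     ITag mailTag = (ITag)mailLinks[i];
     Console.WriteLine(mailTag.GetAttribute("href"));
 }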
Example #6
 /// <summary>
 /// Configures the HTML node filters for the POI list page
 /// </summary>
 private static void MakeFilters()
 {
     NodeClassFilter dlFilter = new NodeClassFilter(typeof(DefinitionList));
     HasAttributeFilter searchListFilter = new HasAttributeFilter("id", "searchList");
     poiListFilter = new AndFilter(new HasParentFilter(searchListFilter, false), dlFilter);
     poiFilter = new NodeClassFilter(typeof(DefinitionListBullet));
     tasteFilter = new HasAttributeFilter("class", "score1");
     environmentFilter = new HasAttributeFilter("class", "score2");
     serviceFilter = new HasAttributeFilter("class", "score3");
     averageFilter = new HasAttributeFilter("class", "average");
     commentFilter = new AndFilter(new HasAttributeFilter("class", "B"), new HasAttributeFilter("module", "list-readreview"));
     HasAttributeFilter nameFilterByParent = new HasAttributeFilter("class", "shopname");
     nameFilter = new AndFilter(new HasParentFilter(nameFilterByParent, false), new HasAttributeFilter("class", "BL"));
     addressFilter = new HasAttributeFilter("class", "address");
     tagsFilter = new HasAttributeFilter("class", "tags");
 }
Example #7
        public void GetFromWeb(IGetFromWebNotify notifier)
        {
            Directory.CreateDirectory(Config.ImagePath);

            if (notifier != null)
                notifier.Notity(String.Format("Downloading {0}", Config.Uri), 0.0f);
            WebClient webClient = new WebClient();
            webClient.Encoding = Encoding.UTF8;
            String strHtml = webClient.DownloadString(Config.Uri);

            if (notifier != null)
                notifier.Notity("Parsing the HTML document", 0.0f);
            Lexer lexer = new Lexer(strHtml);
            Parser parser = new Parser(lexer);
            AndFilter andFilter = new AndFilter(new NodeClassFilter(typeof(TableRow)), new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
            NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);
            lock (this)
            {
                m_Cards = new List<Card>();
                foreach (INode node in htmlNodes.ToNodeArray())
                {
                    int iFiledIndex = 0;
                    Card card = new Card();
                    foreach (INode subNode in node.Children.ToNodeArray())
                    {
                        if (subNode is TextNode)
                        {
                            continue;
                        }

                        // Columns in page order: 0 = ID, 1 = name, 2 = long info, 3 = mana cost, 4 = rarity.
                        switch (iFiledIndex)
                        {
                            case 0:
                                card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                                card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                                break;
                            case 1:
                                card.Name = subNode.FirstChild.FirstChild.GetText();
                                break;
                            case 2:
                                StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                                break;
                            case 3:
                                if (subNode.FirstChild != null)
                                {
                                    card.ManaCost = subNode.FirstChild.GetText();
                                }
                                else
                                {
                                    card.ManaCost = String.Empty;
                                }
                                break;
                            case 4:
                                card.Rare = subNode.FirstChild.GetText();
                                break;
                        }

                        iFiledIndex++;
                    }
                    m_Cards.Add(card);
                }
            }

            XmlSerializer s = new XmlSerializer(typeof(List<Card>));
            FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew);
            s.Serialize(fstream, m_Cards);
            fstream.Close();

            foreach (Card card in m_Cards)
            {
                if (notifier != null)
                    notifier.Notity(String.Format("Fetching card \"{0}\"", card.Name), 1.0f / m_Cards.Count);
                webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
            }
        }
Example #8
 /// <summary>
 /// Given the root node, returns the device/client the feed was sent from
 /// </summary>
 /// <param name="i">Sequence number of the feed within the page it belongs to</param>
 /// <param name="feedDiv">The div tag that contains the feed</param>
 /// <returns>The device/client the feed was sent from</returns>
 private string GetFeedSendTypeInfo(int i, INode feedDiv)
 {
     string result = "";
     NodeList feedSendTypeNodeList = feedDiv.Children.ExtractAllNodesThatMatch(feedSendTypeFilter, true);
     if (feedSendTypeNodeList.Size() == 1)
     {
         result = ((ATag)(feedSendTypeNodeList[0])).StringText;
     }
     else
     {
         //In some cases the page shows "来自未经审核的应用" ("from an unreviewed application")
         AndFilter fromFilter = new AndFilter(new HasParentFilter(feedFromFilter, true), new NodeClassFilter(typeof(TextNode)));
         NodeList textNodeList = feedDiv.Children.ExtractAllNodesThatMatch(fromFilter, true);
         for (int j = 0; j < textNodeList.Size(); j++)
         {
             if (textNodeList[j].ToPlainTextString().Equals("来自"))
             {
                 if (j + 1 < textNodeList.Size())//guard against the rare case where the "来自" text node has no device node after it
                 {
                     result = textNodeList[j + 1].ToPlainTextString();
                 }
                 break;
             }
         }
         if (result.Equals(""))
         {
             Console.WriteLine("第" + i + "条微博中,微博发送设备为空");
         }
     }
     char[] shouldRemove = { ' ', '\r', '\n' };
     result = result.Trim(shouldRemove);
     return result;
 }
Example #9
 /// <summary>
 /// Configures the various HTML node filters
 /// </summary>
 private static void MakeFilters()
 {
     //When crawling a personal home page, the following filters pick out the div that carries the mid attribute; mid relates to maid and endid
     idFilter = new List<HasAttributeFilter>();
     idFilter.Add(new HasAttributeFilter("class", "WB_feed_type SW_fun  "));
     //Select the div of each individual feed
     feedFilter = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix");
     idFilter.Add(feedFilter);
     //Select the div that contains the feed author: a reposted feed's div also carries class="WB_info", so combining two filters is more reliable
     HasAttributeFilter wbDetailFilter = new HasAttributeFilter("class", "WB_detail");
     feedAuthorFilter = new AndFilter(new HasAttributeFilter("class", "WB_info"), new HasParentFilter(wbDetailFilter, false));
     //Select the div that contains the feed text: a reposted feed's div also carries class="WB_text", so combining two filters is more reliable
     feedContentFilter = new AndFilter(new HasAttributeFilter("class", "WB_text"), new HasAttributeFilter("node-type", "feed_list_content"));
     //Select the div that contains the reposted feed
     reFeedFilter = new HasAttributeFilter("node-type", "feed_list_forwardContent");
     //Select the div of the reposted feed's original author: two filters are needed for a similar reason
     reFeedAuthorFilter = new AndFilter(new HasAttributeFilter("class", "WB_info"), new HasParentFilter(reFeedFilter, true));
     //Select the content of the reposted feed: two filters are needed for a similar reason
     reFeedContentFilter = new AndFilter(new HasAttributeFilter("class", "WB_text"), new HasAttributeFilter("node-type", "feed_list_reason"));
     //Select reposted feeds that have been deleted (for the case where the div sits under the div matched by reFeedFilter)
     refeedDeletedFilter1 = new HasAttributeFilter("class", "WB_deltxt");
     //Select reposted feeds that have been deleted (for the case where the div sits under <div class="WB_detail">)
     refeedDeletedFilter2 = new AndFilter(new HasParentFilter(wbDetailFilter, true), refeedDeletedFilter1);
     //Select the <b> tag that holds the repost count of the original feed
     similarFeedCountFilter = new AndFilter(new HasAttributeFilter("class", "S_spetxt"), new HasAttributeFilter("node-type", "followNum"));
     //Select the tag that holds reposts similar to the original feed
     HasAttributeFilter similarFeedFilterByParent = new HasAttributeFilter("class", "WB_feed_datail S_line2 clearfix WB_feed_noLine");
     similarFeedFilter = new AndFilter(wbDetailFilter, new HasParentFilter(similarFeedFilterByParent, false));
     //Select the div that contains the location the feed was sent from
     feedLocationFilter = new AndFilter(new HasAttributeFilter("class", "map_data"), new HasParentFilter(wbDetailFilter, false));
     //Select the div that contains the feed's send time, send method, repost count and comment count
     AndFilter feedMetaDataFilter = new AndFilter(new NotFilter(new HasParentFilter(new HasAttributeFilter("class", "WB_media_expand SW_fun2 S_line1 S_bg1"), true)), new HasAttributeFilter("class", "WB_func clearfix"));
     //Select the div that contains the repost count and comment count
     AndFilter feedHandleFilter = new AndFilter(new HasParentFilter(feedMetaDataFilter, false), new HasAttributeFilter("class", "WB_handle"));
     //Select the div that contains the send time and send method
     feedFromFilter = new AndFilter(new HasParentFilter(feedMetaDataFilter, false), new HasAttributeFilter("class", "WB_from"));
     //Select the link tag that holds the "like" count
     feedLikeFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_like"));
     //Select the link tag that holds the repost count
     feedForwardFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_forward"));
     //Select the link tag that holds the comment count
     feedCommentFilter = new AndFilter(new HasParentFilter(feedHandleFilter, false), new HasAttributeFilter("action-type", "fl_comment"));
     //Select the link tag that holds the feed's send time
     feedTimeFilter = new AndFilter(new HasParentFilter(feedFromFilter, false), new HasAttributeFilter("class", "S_link2 WB_time"));
     //Select the link tag that holds the feed's send method
     feedSendTypeFilter = new AndFilter(new HasParentFilter(feedFromFilter, false), new HasAttributeFilter("class", "S_link2"));
 }
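A brief sketch of how the filters configured above might be used. Only MakeFilters() is shown in the example, so the page-loading step and the parsing loop below are assumptions; feedFilter and feedContentFilter are the static fields set above, and "feedPage.html" is a placeholder file name.

 // Assumed usage: after MakeFilters() has run, parse a saved feed page,
 // pick out each feed div with feedFilter, then read its text with feedContentFilter.
 Parser parser = new Parser(new Lexer(File.ReadAllText("feedPage.html")));
 NodeList feedDivs = parser.Parse(feedFilter);
 for (int i = 0; i < feedDivs.Size(); i++)
 {
     NodeList contentNodes = feedDivs[i].Children.ExtractAllNodesThatMatch(feedContentFilter, true);
     if (contentNodes.Size() > 0)
     {
         Console.WriteLine(contentNodes[0].ToPlainTextString());
     }
 }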
Example #10
        public void GetLinkForPage(string url)
        {
            Lexer lexer = new Lexer(GetHtml(url));
            Parser parse = new Parser(lexer);
            parse.Encoding = "gb2312";
            NodeFilter linkFilter = new LinkRegexFilter(@"^http\://item\.taobao\.com/item\.htm\?id\=\d+$");
            NodeFilter classFilter = new HasAttributeFilter("class", "EventCanSelect");
            AndFilter andFilter = new AndFilter(linkFilter, classFilter);
            NodeList result = parse.Parse(andFilter);

            int length = result.Count;
            for (int i = 0; i < length; i++)
            {
                ItemLink.Add(result[i]);
            }
        }
Example #11
        public NodeList GetDetailPageForHtml(string html)
        {
            Parser parse = GetParser(html);

            NodeFilter showidFilter = new HasAttributeFilter("id", "detail");
            NodeFilter showclassFilter = new HasAttributeFilter("class", "box");
            AndFilter showFilter = new AndFilter(showidFilter, showclassFilter);

            NodeFilter contentidFilter = new HasAttributeFilter("id", "J_DivItemDesc");
            NodeFilter contentclassFilter = new HasAttributeFilter("class", "content");
            AndFilter contentFilter = new AndFilter(contentidFilter, contentclassFilter);

            OrFilter orFilter = new OrFilter(showFilter, contentFilter);

            return parse.Parse(orFilter);
        }
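A short usage sketch for the method above. The detail-page HTML would come from an HTTP fetch that is not part of the example, so the file name below is a placeholder, and the call assumes it is made from within the same class; the loop uses only NodeList members already seen on this page.

        // Assumed usage: load the detail-page HTML from disk, then dump the plain text of the
        // two matched regions (the "detail"/"box" div and the "J_DivItemDesc"/"content" div).
        string html = File.ReadAllText("detailPage.html");
        NodeList detailNodes = GetDetailPageForHtml(html);
        for (int i = 0; i < detailNodes.Count; i++)
        {
            Console.WriteLine(detailNodes[i].ToPlainTextString());
        }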