/// <summary>
/// Returns the plain-text content of the first tag (of the given tag name) whose
/// attribute matches the given value.
/// </summary>
/// <param name="html">The HTML document to search. (Fixed: the original doc referred to a non-existent "parser" parameter.)</param>
/// <param name="tag">Tag name to match (e.g. "div").</param>
/// <param name="attribute">Attribute name to look for on the tag.</param>
/// <param name="attValue">Attribute value that must match exactly.</param>
/// <returns>The plain text inside the first matching tag, or null when nothing matches.</returns>
public static string getValue(string html, string tag, string attribute, string attValue)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new TagNameFilter(tag);
    NodeList nodeList = parser.Parse(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        // Guard the cast: skip anything that is not a tag instead of risking a
        // NullReferenceException (the original dereferenced the 'as' result unchecked).
        ITag tagNode = nodeList[i] as ITag;
        if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
        {
            continue;
        }
        foreach (string key in tagNode.Attributes.Keys)
        {
            // "<TAGNAME>" is the parser's synthetic key holding the tag's own name,
            // not a real attribute — never match against it.
            if (key.Contains("<TAGNAME>"))
            {
                continue;
            }
            if (key.Contains(attribute) && tagNode.Attributes[key].ToString() == attValue)
            {
                return tagNode.ToPlainTextString();
            }
        }
    }
    return null;
}
/// <summary>
/// Parses the blog list page and appends every entry's link to blogLinkList.
/// Expects exactly one article container and exactly one anchor per entry;
/// anything else is reported on the console.
/// </summary>
static void GetBlogLink(string htmlContent)
{
    Parser parser = new Parser(new Lexer(htmlContent));
    NodeList articleList = parser.Parse(articleFilter);
    if (articleList.Count != 1)
    {
        Console.WriteLine("获取包含日志列表出错!");
        return;
    }
    NodeList entries = articleList[0].Children.ExtractAllNodesThatMatch(wrapFilter, true);
    for (int index = 0; index < entries.Count; index++)
    {
        NodeList anchors = entries[index].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), false);
        if (anchors.Count == 1)
        {
            blogLinkList.Add(((ATag)anchors[0]).ExtractLink());
        }
        else
        {
            Console.WriteLine("第" + index + "个条目中,判断链接出错!");
        }
    }
}
/// <summary>
/// Scrapes product entries (name, price, image, promotion) from a listing page.
/// Entries that fail to parse are skipped silently — this is best-effort scraping.
/// </summary>
/// <param name="html">The listing page HTML.</param>
/// <returns>All products that parsed successfully; empty list when none did.</returns>
public static List<Product> LoadGoods(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList products = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product"));
    List<Product> result = new List<Product>();
    for (int i = 0; i < products.Count; i++)
    {
        try
        {
            ITag product = products[i] as ITag;

            // Product name.
            NodeList names = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product-title"), true);
            string pname = (names[0] as ITag).ToPlainTextString().Trim();

            // Price (the original comment here wrongly said "name"): skip the
            // leading 7-character currency prefix before parsing.
            NodeList prices = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product-price"), true);
            decimal pprice = Decimal.Parse((prices[0] as ITag).ToPlainTextString().Trim().Substring(7));

            // Image URL lives in the lazy-load attribute rather than src.
            NodeList imgs = product.Children.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
            string pimg = (imgs[0] as ITag).GetAttribute("DATA-KS-LAZYLOAD");

            // Promotion is optional.
            string ppromo = "";
            NodeList promos = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "promo"), true);
            if (promos.Count > 0)
            {
                ppromo = (promos[0] as ITag).GetAttribute("data-promo");
            }

            Product p = new Product();
            p.img = pimg;
            p.name = pname;
            p.price = pprice;
            p.promo = ppromo;
            result.Add(p);
        }
        catch
        {
            // Malformed entry: ignore and continue with the next product.
        }
    }
    return result;
}
/// <summary>
/// Adds 1x2 odds-history rows for a schedule: fetches each company's history page,
/// parses the odds table and stores every qualifying row via dal.Add.
/// Existing rows for the schedule are deleted first.
/// </summary>
/// <param name="scheduleID">Schedule (match) identifier the rows belong to.</param>
/// <param name="companyids">Comma-separated company IDs; must pair 1:1 with historyids.</param>
/// <param name="historyids">Comma-separated history-page IDs, one per company.</param>
/// <param name="time">Cut-off: rows with a timestamp later than this are skipped.</param>
/// <returns>A JSON string with success=true and the number of rows inserted.</returns>
public string Add(string scheduleID, string companyids, string historyids, DateTime time)
{
    WebClientBLL bll = new WebClientBLL();
    string[] companyidArr = companyids.Split(',');
    string[] historyidArr = historyids.Split(',');
    int count = 0;
    // Only proceed when the two ID lists pair up one-to-one.
    if (companyidArr.Length == historyidArr.Length)
    {
        // Replace semantics: drop any previously stored rows for this schedule.
        dal.Delete(scheduleID);
        for (int i = 0; i < companyidArr.Length; i++)
        {
            string s = bll.GetOddsHistoryContent(historyidArr[i]);
            Lexer lexer = new Lexer(s);
            Parser parser = new Parser(lexer);
            // Drill down HTML -> BODY -> first table; assumes the page always has
            // exactly this structure (no guards — a layout change will throw).
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag table = bodyNodes.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableTag))[0] as ITag;
            NodeList tableRows = table.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            for (int f = 0; f < tableRows.Count; f++)
            {
                ITag row = tableRows[f] as ITag;
                // Data rows are identified by their ALIGN/BGCOLOR styling.
                if (row.Attributes["ALIGN"].Equals("center") && row.Attributes["BGCOLOR"].Equals("#FFFFFF"))
                {
                    Odds1x2History model = new Odds1x2History();
                    model.companyid = int.Parse(companyidArr[i]);
                    model.scheduleid = int.Parse(scheduleID);
                    // Cells 0..2 hold the home/draw/away odds.
                    model.home = float.Parse(row.Children[0].ToPlainTextString());
                    model.draw = float.Parse(row.Children[1].ToPlainTextString());
                    model.away = float.Parse(row.Children[2].ToPlainTextString());
                    this.FillOdds1x2History(model);
                    // Cell 3 contains a JS call like "showtime(yyyy,mm,dd,hh,mi,ss)";
                    // strip the wrapper and split the arguments.
                    string[] t2 = row.Children[3].ToPlainTextString().Replace("showtime(", "").Replace(")", "").Split(',');
                    int yy = int.Parse(t2[0]);
                    // NOTE(review): Remove(2) keeps only the first 2 chars of the month
                    // field — presumably trimming trailing junk; confirm source format.
                    int mm = int.Parse(t2[1].Remove(2));
                    int dd = int.Parse(t2[2]);
                    int hh = int.Parse(t2[3]);
                    int mi = int.Parse(t2[4]);
                    int ss = int.Parse(t2[5]);
                    // Timestamps are shifted +8h (UTC -> UTC+8 local time of the source site).
                    model.time = new DateTime(yy, mm, dd, hh, mi, ss, DateTimeKind.Utc).AddHours(8d);
                    // Skip rows newer than the supplied cut-off.
                    if (model.time > time)
                    {
                        continue;
                    }
                    dal.Add(model);
                    count++;
                }
            }
        }
    }
    JSONHelper json = new JSONHelper();
    json.success = true;
    // NOTE: "totlalCount" is a (misspelled) property of JSONHelper declared elsewhere.
    json.totlalCount = count;
    return json.ToString();
}
/// <summary>
/// Reads the local catalogue page, downloads every chapter of the story from
/// www.mlxiaoshuo.com and writes the chapter texts to "革命逸事.txt".
/// </summary>
static void GetStoryOfRevolution()
{
    string catalogueHtml;
    using (StreamReader reader = new StreamReader("catalogue.htm"))
    {
        catalogueHtml = reader.ReadToEnd();
    }
    Parser parser = new Parser(new Lexer(catalogueHtml));
    // Chapter links: anchors with the link class whose parent is the chapter list row.
    HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
    HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
    AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
    NodeList linkNodeList = parser.Parse(linkFilter);
    List<string> linkUrlList = new List<string>(linkNodeList.Size());
    List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
    for (int i = 0; i < linkNodeList.Size(); i++)
    {
        ATag linkNode = (ATag)linkNodeList[i];
        linkUrlList.Add(linkNode.Link);
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkUrlList[linkUrlList.Count - 1]);
        // BUG FIX: the original closed only the LAST reader, never disposed any
        // response, and threw NullReferenceException (Close on null) when the
        // catalogue contained no links. Dispose each reader/response per iteration.
        using (WebResponse response = httpWebRequest.GetResponse())
        using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
        {
            chapterHtmlContentList.Add(chapterReader.ReadToEnd());
        }
        Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
    }
    // Paragraph container of a chapter page.
    HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
    using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
    {
        for (int i = 0; i < chapterHtmlContentList.Count; i++)
        {
            writer.WriteLine("第" + (i + 1) + "章");
            Parser chapterParser = new Parser(new Lexer(chapterHtmlContentList[i]));
            NodeList praghNodeList = chapterParser.Parse(praghFilter);
            if (praghNodeList.Size() == 1)
            {
                for (int j = 0; j < praghNodeList[0].Children.Size(); j++)
                {
                    // Only <p> children are story text.
                    if (praghNodeList[0].Children[j].GetType().Equals(typeof(ParagraphTag)))
                    {
                        ParagraphTag praghTag = (ParagraphTag)praghNodeList[0].Children[j];
                        writer.WriteLine("    " + praghTag.StringText);
                    }
                }
                writer.WriteLine();
            }
            else
            {
                Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
            }
        }
    }
}
/// <summary>
/// Parses a saved cinema schedule page into PlayTime entries.
/// </summary>
/// <param name="xmlFile">Path of the saved page; the first 4-digit run in the path is used as the cinema ID.</param>
/// <returns>The parsed show times, or null when the page contains no "px14" entries.</returns>
public List<PlayTime> getPlayTimes(string xmlFile)
{
    // The cinema ID is embedded in the file path as a 4-digit number.
    Match match = Regex.Match(xmlFile, @"\d\d\d\d");
    string cinemaID = match.Value;
    List<PlayTime> playTimes = new List<PlayTime>();
    string html = File.ReadAllText(xmlFile);
    Lexer lexer = new Lexer(html);
    Parser playParser = new Parser(lexer);
    NodeFilter playFilter = new HasAttributeFilter("CLASS", "px14");
    NodeList playTimeList = playParser.ExtractAllNodesThatMatch(playFilter);
    if (playTimeList.Count >= 1)
    {
        for (int i = 0; i < playTimeList.Count; i++)
        {
            PlayTime playTime = new PlayTime();
            ITag playTag = (playTimeList[i] as ITag);
            // NOTE(review): assumes the entry's first child is the tag carrying the
            // movie link; if it is a text node this cast is null and the Attributes
            // access below throws — confirm against the page layout.
            ITag idTag = (playTag.FirstChild as ITag);
            if (idTag.Attributes != null)
            {
                string strID = idTag.Attributes["HREF"].ToString();
                // Movie IDs are 6 digits; fall back to 5 digits for shorter IDs.
                Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
                if (idMatch.Success)
                {
                    playTime.MovieID = int.Parse(idMatch.Value);
                }
                else
                {
                    Match strMatch = Regex.Match(strID, @"\d\d\d\d\d");
                    if (strMatch.Success)
                    {
                        playTime.MovieID = int.Parse(strMatch.Value);
                    }
                }
            }
            // Show-time text sits two siblings after the entry; trim the surrounding
            // '上'/'映' ("screening") characters before parsing the date.
            string strTime = playTag.NextSibling.NextSibling.ToPlainTextString();
            char[] a = { '上', '映' };
            strTime = strTime.Trim(a);
            playTime.Playtime = DateTime.Parse(strTime);
            playTime.CinemaID = int.Parse(cinemaID);
            playTime.PlayState = true;
            playTimes.Add(playTime);
        }
        return playTimes;
    }
    return null;
}
/// <summary> Scan for script.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
/// <returns>The same tag, completed with its script content and an end tag
/// (a synthetic one when none was found in the input).</returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    System.String language;
    System.String code;
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    if (tag is ScriptTag)
    {
        language = ((ScriptTag) tag).Language;
        // Microsoft Script Encoder payloads ("JScript.Encode"/"VBScript.Encode")
        // must be decoded before the CDATA is consumed.
        if ((null != language) && (language.ToUpper().Equals("JScript.Encode".ToUpper()) || language.ToUpper().Equals("VBScript.Encode".ToUpper())))
        {
            code = ScriptDecoder.Decode(lexer.Page, lexer.Cursor);
            ((ScriptTag) tag).ScriptCode = code;
        }
    }
    // Consume everything up to a plausible end tag as CDATA; non-strict mode
    // (when STRICT is false) tolerates quirky script bodies.
    content = lexer.ParseCDATA(!STRICT);
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Rewind the lexer unless the next node is exactly this tag's end tag.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/script", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length span (start == end) marks the end tag as virtual.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary>
/// Returns the page title extracted from the HTML, or "" (after logging)
/// when exactly one title tag cannot be located.
/// </summary>
static string GetBlogTitle(string htmlContent)
{
    Parser parser = new Parser(new Lexer(htmlContent));
    NodeList titleList = parser.Parse(titleFilter);
    if (titleList.Count != 1)
    {
        Console.WriteLine("获取标题信息出错!");
        return "";
    }
    return ((TitleTag)titleList[0]).Title;
}
/// <summary>
/// Fetches the live ("走地") odds page for a match and, when the full-time 1x2
/// panel is present, stores the odds as a history row via dal.AddHistory.
/// </summary>
/// <param name="matchid">Match identifier stamped onto the stored history row.</param>
/// <param name="urlparams">URL of the live-odds page to fetch.</param>
/// <returns>
/// Always an empty list — the method persists data through dal.AddHistory and
/// never populates the returned list. NOTE(review): confirm callers do not
/// expect this list to be filled.
/// </returns>
public List<OddsLiveMatch> GetMatchScrollOdds(string matchid, string urlparams)
{
    List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
    try
    {
        HttpHelper h = new HttpHelper();
        // Force the site's language via the "lng" cookie before fetching.
        Cookie lng = new Cookie("lng", "2");
        lng.Domain = domain;
        h.CookieContainer.Add(lng);
        //string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + urlparams);
        string zoudi = h.GetHtml(urlparams);
        if (!string.IsNullOrEmpty(zoudi))
        {
            #region 分析网页html节点
            Lexer lexer = new Lexer(zoudi);
            Parser parser = new Parser(lexer);
            // Drill down HTML -> BODY -> FORM -> first DIV; assumes this fixed layout.
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
            if (divNode.Attributes["ID"].Equals("PageBody"))
            {
                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                // Panel header must read "走地盤" (live odds) ...
                if (dataDivList[0].ToPlainTextString() == "走地盤")
                {
                    // ... and the section "全場賽果" (full-time result) must follow.
                    if (dataDivList[2].ToPlainTextString() == "全場賽果")
                    {
                        OddsLiveHistory liveHistory = new OddsLiveHistory();
                        liveHistory.matchid = matchid;
                        // Divs 3/5/7 hold home/draw/away odds; the number is the
                        // first space-separated token of each cell.
                        liveHistory.home = float.Parse(dataDivList[3].ToPlainTextString().Split(' ')[0]);
                        liveHistory.draw = float.Parse(dataDivList[5].ToPlainTextString().Split(' ')[0]);
                        liveHistory.away = float.Parse(dataDivList[7].ToPlainTextString().Split(' ')[0]);
                        liveHistory.time = DateTime.Now;
                        dal.AddHistory(liveHistory);
                    }
                }
            }
            #endregion 分析网页html节点
        }
    }
    catch (Exception)
    {
        // NOTE(review): all failures (network, layout changes, parse errors) are
        // swallowed silently; consider at least logging here.
    }
    return liveMatchList;
}
/// <summary>
/// Extracts the five facility descriptions (dining, parking, game center, 3D, VIP)
/// from a cinema page. Elements with class "c_000" carry the text; the icon class
/// on the element's previous sibling identifies which facility the text describes.
/// </summary>
/// <param name="html">The cinema detail page HTML.</param>
/// <returns>Always true; facilities that are absent leave their out-string empty.</returns>
public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
{
    dining = string.Empty;
    park = string.Empty;
    gameCenter = string.Empty;
    intro3D = string.Empty;
    introVIP = string.Empty;
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        INode node = nodeList[i];
        ITag tagPar = node.Parent as ITag;
        ITag tagSib = node.PreviousSibling as ITag;
        // BUG FIX: the previous sibling can be a text node (e.g. whitespace), which
        // makes the 'as ITag' cast null; the original dereferenced it unconditionally
        // and crashed with NullReferenceException.
        if (tagPar == null || tagSib == null || tagSib.Attributes["CLASS"] == null)
        {
            continue;
        }
        switch (tagSib.Attributes["CLASS"].ToString())
        {
            case "ico_cside1 mr12":
                dining = tagPar.ToPlainTextString();
                break;
            case "ico_cside2 mr12":
                park = tagPar.ToPlainTextString();
                break;
            case "ico_cside3 mr12":
                gameCenter = tagPar.ToPlainTextString();
                break;
            case "ico_cside5 mr12":
                intro3D = tagPar.ToPlainTextString();
                break;
            case "ico_cside7 mr12":
                introVIP = tagPar.ToPlainTextString();
                break;
        }
    }
    return true;
}
/// <summary>
/// Returns the value of attribute <paramref name="attributeV"/> on the single tag
/// whose attribute <paramref name="attribute"/> equals <paramref name="attValue"/>.
/// </summary>
/// <param name="html">The HTML document to search.</param>
/// <param name="tag">Unused; kept for signature compatibility with the sibling getValue/getValues helpers.</param>
/// <param name="attribute">Attribute name used to locate the tag.</param>
/// <param name="attValue">Attribute value used to locate the tag.</param>
/// <param name="attributeV">Name of the attribute whose value is returned.</param>
/// <returns>The attribute value, or null when no single matching tag is found.</returns>
public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    // Only answer when the match is unambiguous (exactly one hit).
    if (nodeList.Count == 1)
    {
        ITag tagNode = nodeList[0] as ITag;
        if (tagNode != null && tagNode.Attributes != null)
        {
            return tagNode.Attributes[attributeV].ToString();
        }
    }
    return null;
}
/// <summary> Scan for style definitions.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
/// <returns>The same tag, completed with its CSS content and an end tag
/// (a synthetic one when none was found in the input).</returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    // Consume the style sheet body as CDATA.
    content = lexer.ParseCDATA();
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Rewind the lexer unless the next node is exactly this tag's end tag.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/style", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length span (start == end) marks the end tag as virtual.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary>
/// Reads the cinema rating from the page. The rating is split across the first
/// and last child of the "point ml12 px18" element (presumably integer part and
/// fractional part, e.g. "7" + ".5"), which are concatenated and parsed.
/// </summary>
/// <param name="html">The cinema detail page HTML.</param>
/// <returns>The parsed rating, or the fallback 7.0 when the element is absent or malformed.</returns>
public float getCinemaGrade(string html)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "point ml12 px18");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    if (nodeList.Count == 1)
    {
        INode node = nodeList[0];
        ITag tagLeft = node.FirstChild as ITag;
        ITag tagRight = node.LastChild as ITag;
        // Guard the casts: a missing child previously caused a NullReferenceException.
        if (tagLeft != null && tagRight != null)
        {
            string strGrade = tagLeft.ToPlainTextString() + tagRight.ToPlainTextString();
            float grade;
            // BUG FIX: parse with the invariant culture — the page always uses '.'
            // as decimal separator, so parsing must not depend on the host's
            // regional settings (and malformed text now falls back instead of throwing).
            if (float.TryParse(strGrade, System.Globalization.NumberStyles.Float,
                               System.Globalization.CultureInfo.InvariantCulture, out grade))
            {
                return grade;
            }
        }
    }
    return 7.0f;
}
/// <summary>
/// Finds the first product-detail link (matching "lookcp.php?cpid=N") in the HTML
/// and rewrites it in place to an absolute rrxf.cn URL.
/// </summary>
/// <param name="html">HTML fragment to search.</param>
/// <returns>The anchor tag with its Link rewritten to an absolute URL.</returns>
public ATag ParseProductUrl(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList matches = parser.Parse(new LinkRegexFilter(@"lookcp\.php\?cpid\=\d{0,}"));
    ATag anchor = matches[0] as ATag;
    anchor.Link = "http://rrxf.cn/product/" + anchor.Link;
    return anchor;
}
/// <summary> Finish off a tag.
/// When the tag has no end tag yet, a virtual one is created at the lexer's
/// current position. The end tag is then parented to the tag and the tag's
/// semantic action is fired.
/// </summary>
/// <param name="tag">The tag to finish off.</param>
/// <param name="lexer">A lexer positioned at the end of the tag.</param>
protected internal virtual void FinishTag(ITag tag, Lexer lexer)
{
    if (tag.GetEndTag() == null)
    {
        tag.SetEndTag(CreateVirtualEndTag(tag, lexer, lexer.Page, lexer.Cursor.Position));
    }
    tag.GetEndTag().Parent = tag;
    tag.DoSemanticAction();
}
/// <summary>
/// Fetches the category page referenced by <paramref name="a"/>, then resolves
/// every "photoyi" block's product link and hands it to ParseProduct.
/// </summary>
public void ParseProducts(ATag a)
{
    string html = GetHtml(a.Link.Replace("../", "http://rrxf.cn/"));
    Parser parser = new Parser(new Lexer(html));
    NodeList navNodes = parser.Parse(new HasAttributeFilter("class", "photoyi"));
    if (navNodes == null)
    {
        return;
    }
    int total = navNodes.Count;
    for (int index = 0; index < total; index++)
    {
        ATag productLink = ParseProductUrl(navNodes[index].ToHtml());
        Console.WriteLine(productLink.Link);
        ParseProduct(productLink);
    }
}
/// <summary> Creates an end tag with the same name as the given tag.</summary>
/// <param name="tag">The tag to end.</param>
/// <param name="lexer">The object containing the node factory.</param>
/// <param name="page">The page the tag is on (virtually).</param>
/// <param name="position">The offset into the page at which the tag is to be anchored.</param>
/// <returns> An end tag named '"/" + tag.getTagName()' whose start and end
/// positions are both the given position. The equal positions may be used to
/// distinguish it as a virtual tag later on.</returns>
protected internal virtual ITag CreateVirtualEndTag(ITag tag, Lexer lexer, Page page, int position)
{
    System.String name = "/" + tag.RawTagName;
    System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
    attributes.Add(new TagAttribute(name, (System.String) null));
    return lexer.NodeFactory.CreateTagNode(page, position, position, attributes);
}
/// <summary>
/// Collects the plain-text content of every tag whose attribute matches the given value.
/// </summary>
/// <param name="html">The HTML document to search.</param>
/// <param name="tag">Unused; kept for signature compatibility with the sibling getValue helper.</param>
/// <param name="attribute">Attribute name to match.</param>
/// <param name="attValue">Attribute value to match.</param>
/// <returns>Plain-text contents of all matching tags; an empty list when none match.</returns>
public static List<string> getValues(string html, string tag, string attribute, string attValue)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    List<string> values = new List<string>();
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        // Guard the cast and skip attribute-less tags, mirroring the filter's intent.
        ITag tagNode = nodeList[i] as ITag;
        if (tagNode != null && tagNode.Attributes != null && tagNode.Attributes.Count > 0)
        {
            values.Add(tagNode.ToPlainTextString());
        }
    }
    return values;
}
/// <summary>
/// Downloads the card table from Config.Uri, parses the even/odd table rows into
/// Card objects, serializes the list to Config.CardsXml and finally downloads each
/// card's image into Config.ImagePath.
/// </summary>
/// <param name="notifier">Optional progress sink; may be null.</param>
public void GetFromWeb(IGetFromWebNotify notifier)
{
    Directory.CreateDirectory(Config.ImagePath);
    if (notifier != null) notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
    WebClient webClient = new WebClient();
    webClient.Encoding = Encoding.UTF8;
    String strHtml = webClient.DownloadString(Config.Uri);
    if (notifier != null) notifier.Notity("����html�ĵ�", 0.0f);
    Lexer lexer = new Lexer(strHtml);
    Parser parser = new Parser(lexer);
    // Card rows are <tr class="even"> or <tr class="odd">.
    AndFilter andFilter = new AndFilter(
        new NodeClassFilter(typeof(TableRow)),
        new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
    NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);
    // NOTE(review): lock(this) is fragile (external code can lock the same object);
    // a private gate field would be safer but cannot be added from within this method.
    lock (this)
    {
        m_Cards = new List<Card>();
        foreach (INode node in htmlNodes.ToNodeArray())
        {
            int iFiledIndex = 0;
            Card card = new Card();
            foreach (INode subNode in node.Children.ToNodeArray())
            {
                // Whitespace between cells appears as text nodes; only element
                // children count as table fields.
                if (subNode is TextNode)
                {
                    continue;
                }
                switch (iFiledIndex)
                {
                    case 0:
                        // Numeric card ID; the image path is derived from it.
                        card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                        card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                        break;
                    case 1:
                        // Name is nested one level deeper (inside an anchor).
                        card.Name = subNode.FirstChild.FirstChild.GetText();
                        break;
                    case 2:
                        StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                        break;
                    case 3:
                        // Mana-cost cell may be empty.
                        if (subNode.FirstChild != null)
                        {
                            card.ManaCost = subNode.FirstChild.GetText();
                        }
                        else
                        {
                            card.ManaCost = String.Empty;
                        }
                        break;
                    case 4:
                        card.Rare = subNode.FirstChild.GetText();
                        break;
                }
                iFiledIndex++;
            }
            m_Cards.Add(card);
        }
    }
    XmlSerializer s = new XmlSerializer(typeof(List<Card>));
    // BUG FIX: the stream was never disposed when Serialize threw, leaking the file
    // handle. NOTE(review): FileMode.CreateNew throws if the file already exists —
    // confirm that is intended.
    using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
    {
        s.Serialize(fstream, m_Cards);
    }
    foreach (Card card in m_Cards)
    {
        if (notifier != null) notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
        webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
    }
}
/// <summary>
/// Fetches the "one day one beauty" page for the given date, extracts the image
/// nodes and forwards each to GetPicUrlsFromBeautyPersonalPage.
/// </summary>
/// <param name="date">Date path segment appended to oneDayOneBeautyBaseUrl.</param>
/// <returns>The number of image nodes processed; 0 on error or when the page is empty.</returns>
static int OneDayOneBeauty(string date)
{
    try
    {
        string htmlContent = "";
        string url = oneDayOneBeautyBaseUrl + date + "/";
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
        httpWebRequest.Method = "GET";
        HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
        if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
        {
            StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024));
            htmlContent = reader.ReadToEnd();
            httpWebResponse.Close();
            reader.Close();
        }
        if (!htmlContent.Equals(""))
        {
            Lexer lexer = new Lexer(htmlContent);
            Parser parser = new Parser(lexer);
            parser.AnalyzePage();
            NodeList divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter);
            // Fall back to the alternate filter (page structure changed 2014-05-16).
            if (divList.Count == 0)
            {
                parser.AnalyzePage();
                divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter2);
            }
            for (int i = 0; i < divList.Count; i++)
            {
                ImageTag imgNode = (ImageTag)divList[i];
                GetPicUrlsFromBeautyPersonalPage(imgNode, i, 1);
            }
            return divList.Count;
        }
        else
        {
            Console.WriteLine("得到的HTML为空!");
            return 0;
        }
    }
    catch (WebException e)
    {
        // BUG FIX: e.Response is null for timeouts / DNS failures; the original
        // dereferenced it unconditionally and crashed inside its own catch block.
        HttpWebResponse httpWebResponse = e.Response as HttpWebResponse;
        if (httpWebResponse == null)
        {
            Console.WriteLine("访问网页出错!状态码:" + e.Status);
            return 0;
        }
        if (httpWebResponse.StatusCode.Equals(HttpStatusCode.NotFound))
        {
            Console.WriteLine("网页未找到!");
        }
        else
        {
            Console.WriteLine("访问网页出错!状态码:" + httpWebResponse.StatusCode);
        }
        httpWebResponse.Close();
        return 0;
    }
}
/// <summary>
/// Parses the current fans page (field currentHtmlContent) of the web version of
/// Weibo and appends one Fan per list entry to <paramref name="fansList"/>:
/// portrait (downloaded asynchronously), follow/fans/feeds counts, introduction,
/// user ID, location, gender and follow method. Structural mismatches are
/// reported on the console and the affected field is left at its default.
/// </summary>
/// <param name="fansList">Receives the fans scraped from the page.</param>
public void GetInfoFromHtml(List<Fan> fansList)
{
    Lexer lexer = new Lexer(currentHtmlContent);
    Parser parser = new Parser(lexer);
    // List of the <li> entries, one per fan.
    NodeList fansNodeList = parser.Parse(fanFilter);
    for (int i = 0; i < fansNodeList.Size(); i++)
    {
        Fan fan = new Fan();
        // The <li> bullet containing one fan.
        Bullet fanBullet = (Bullet)fansNodeList[i];
        #region Fan portrait
        NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
        if (fanPortraitNodeList.Size() == 1)
        {
            Div fanPortraitDiv = (Div)fanPortraitNodeList[0];
            NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
            if (imgNodeList.Size() == 1)
            {
                ImageTag imgNode = (ImageTag)imgNodeList[0];
                if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                {
                    string imgUrl = imgNode.GetAttribute("SRC");
                    string imgName = imgNode.GetAttribute("ALT");
                    // The ALT text doubles as the fan's display name.
                    fan.Name = imgName;
                    // WebClient is used here because downloading the portrait does
                    // not require the login cookie.
                    WebClient wc = new WebClient();
                    // BUG FIX: subscribe BEFORE starting the async download — the
                    // original attached the handler after DownloadFileAsync, so a
                    // fast completion could fire before the handler existed.
                    wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                    wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg");
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                }
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
        }
        #endregion
        #region Follow / fans / feeds counts
        NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
        if (fanConnectNodeList.Size() == 1)
        {
            NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
            // Exactly three anchors: follow count, fans count, feeds count.
            if (ATagList.Size() == 3)
            {
                for (int j = 0; j < 3; j++)
                {
                    ATag aTag = (ATag)ATagList[j];
                    switch (j)
                    {
                        case 0:
                            if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow"))
                            {
                                fan.FollowCount = Int32.Parse(aTag.StringText);
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                            }
                            break;
                        case 1:
                            if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans"))
                            {
                                fan.FansCount = Int32.Parse(aTag.StringText);
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                            }
                            break;
                        default:
                            fan.FeedsCount = Int32.Parse(aTag.StringText);
                            break;
                    }
                }
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
        }
        #endregion
        #region Introduction
        NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
        if (fanInfoNodeList.Size() == 1)
        {
            //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml());
            Div fanInfoDiv = (Div)fanInfoNodeList[0];
            string intro = fanInfoDiv.StringText;
            // Text is "简介:<text>"; strip the label and normalize whitespace.
            if (intro.Substring(0, 2).Equals("简介"))
            {
                fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " ");
            }
        }
        else
        {
            if (fanInfoNodeList.Size() == 0)
            {
                // No introduction block on the page: treat as empty.
                fan.Introduction = "";
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
            }
        }
        #endregion
        #region UserID, location and gender; verify the user name
        NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
        if (fanLocationNodeList.Size() == 1)
        {
            // UserID and link come from the first anchor; it also carries the
            // display name used to cross-check the portrait's ALT text.
            NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
            if (aTagNodeList.Size() >= 1)
            {
                ATag nameNode = (ATag)aTagNodeList[0];
                if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                {
                    // USERCARD is "id=<uid>".
                    string uidStr = nameNode.GetAttribute("USERCARD");
                    if (uidStr.Substring(0, 3).Equals("id="))
                    {
                        fan.UserID = uidStr.Substring(3, uidStr.Length - 3);
                    }
                    // Weibo profile link.
                    string linkUrl = nameNode.GetAttribute("HREF");
                    fan.LinkURL = "http://www.weibo.com" + linkUrl;
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                }
                // Cross-check: anchor text should match the portrait's ALT name.
                if (!nameNode.StringText.Equals(fan.Name))
                {
                    Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                }
            }
            // Location and gender come from the "addr" element.
            NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
            if (locationNodeList.Size() == 1)
            {
                string locationStr = "";
                for (int j = 0; j < locationNodeList[0].Children.Size(); j++)
                {
                    INode node = locationNodeList[0].Children[j];
                    if (node.GetType().Equals(typeof(TextNode)))
                    {
                        TextNode tNode = (TextNode)node;
                        locationStr += tNode.ToPlainTextString();
                    }
                    if (node.GetType().Equals(typeof(TagNode)))
                    {
                        TagNode tNode = (TagNode)node;
                        if (tNode.Attributes.ContainsKey("CLASS"))
                        {
                            // Check "female" FIRST: the string "female" contains
                            // "male", so testing "male" first would match everyone.
                            if (tNode.GetAttribute("CLASS").Contains("female"))
                            {
                                fan.Gender = "female";
                            }
                            else
                            {
                                if (tNode.GetAttribute("CLASS").Contains("male"))
                                {
                                    fan.Gender = "male";
                                }
                                else
                                {
                                    fan.Gender = "unknown";
                                    Console.WriteLine("第" + i + "个粉丝性别不明!");
                                }
                            }
                        }
                    }
                }
                fan.Location = locationStr.Trim();
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
        }
        #endregion
        #region Follow method
        NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
        if (followMethodNodeList.Size() == 1)
        {
            NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
            if (methodNodeList.Size() == 1)
            {
                ATag methodNode = (ATag)methodNodeList[0];
                fan.FollowMethod = methodNode.StringText.Trim();
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
        }
        #endregion
        fansList.Add(fan);
    }
}
/// <summary>
/// Parses the current POI (restaurant) list page (field currentHtml) and appends
/// one POI per "DD" entry to poiList: taste/environment/service scores, star rank,
/// average cost, comment count, shop name, address/phone and tags. Structural
/// mismatches are reported on the console.
/// </summary>
/// <param name="currentPage">Page number recorded on each parsed POI.</param>
public void GetInfoFromHtml(int currentPage)
{
    Lexer lexer = new Lexer(currentHtml);
    Parser parser = new Parser(lexer);
    NodeList poiHeadList = parser.Parse(poiListFilter);
    if (poiHeadList.Count == 1)
    {
        NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
        int numCount = 0;
        for (int i = 0; i < poiNodeList.Count; i++)
        {
            POI poi = new POI();
            DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
            // Only <dd> bullets are real POI entries.
            if (poiNode.TagName.Equals("DD"))
            {
                numCount++;
                poi.Page = currentPage;
                poi.Number = numCount;
                #region 获取口味、环境和服务评分,以及获取星级
                NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                {
                    // "-" marks a missing score; the property keeps its default then.
                    Span spanNode = (Span)tasteNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    spanNode = (Span)environmentNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    spanNode = (Span)serviceNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    #region 获取星级
                    // The star-rank span sits two siblings after the score container;
                    // its TITLE attribute contains the Chinese numeral (一..五) of the rank.
                    INode rankNodeOfParent = spanNode.Parent.NextSibling.NextSibling;
                    if (rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                    {
                        INode rankNodeCandidate = rankNodeOfParent.Children[0];
                        if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                        {
                            Span rankNode = (Span)rankNodeCandidate;
                            string rank = rankNode.GetAttribute("TITLE");
                            // 五/四/三/二/一 map to 5/4/3/2/1 stars.
                            if (rank.Contains("五"))
                            {
                                poi.Rank = 5;
                            }
                            else
                            {
                                if (rank.Contains("四"))
                                {
                                    poi.Rank = 4;
                                }
                                else
                                {
                                    if (rank.Contains("三"))
                                    {
                                        poi.Rank = 3;
                                    }
                                    else
                                    {
                                        if (rank.Contains("二"))
                                        {
                                            poi.Rank = 2;
                                        }
                                        else
                                        {
                                            if (rank.Contains("一"))
                                            {
                                                poi.Rank = 1;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    #endregion
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                }
                #endregion
                #region 获取平均消费
                NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                if (averageNodeList.Count == 1)
                {
                    INode averageNode = averageNodeList[0];
                    // The cost figure is the text node two siblings after the label.
                    if (averageNode.NextSibling.NextSibling.GetType().Equals(typeof(TextNode)))
                    {
                        string cost = ((TextNode)averageNode.NextSibling.NextSibling).ToPlainTextString();
                        poi.AverageCost = Int32.Parse(cost);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                }
                #endregion
                #region 获取点评数
                NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                if (commentNodeList.Count == 1)
                {
                    INode commentNode = commentNodeList[0];
                    if (commentNode.GetType().Equals(typeof(ATag)))
                    {
                        string commentNum = ((ATag)commentNode).StringText;
                        // Strip the trailing "封点评" ("reviews") suffix before parsing.
                        if (commentNum.Substring(commentNum.Length - 3, 3).Equals("封点评"))
                        {
                            commentNum = commentNum.Substring(0, commentNum.Length - 3);
                        }
                        poi.CommentCount = Int32.Parse(commentNum);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                }
                #endregion
                #region 获取店名
                NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                if (nameNodeList.Count == 1)
                {
                    INode nameNode = nameNodeList[0];
                    if (nameNode.GetType().Equals(typeof(ATag)))
                    {
                        poi.Name = ((ATag)nameNode).StringText;
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                }
                #endregion
                #region 获取地址和电话
                NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                if (addressNodeList.Count == 1)
                {
                    NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (districtNodeList.Count == 1)
                    {
                        ATag districtTag = (ATag)districtNodeList[0];
                        string address = districtTag.ToPlainTextString();
                        if (districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                        {
                            TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                            string detailAddress = detailAddressNode.ToPlainTextString();
                            detailAddress = detailAddress.Trim();
                            // NOTE(review): assumes the last 8 characters are always
                            // the phone number — confirm against the page format;
                            // shorter strings would make Substring throw.
                            string phoneStr = detailAddress.Substring(detailAddress.Length - 8, 8);
                            poi.Phone = phoneStr;
                            address += detailAddress.Substring(0, detailAddress.Length - 8);
                        }
                        // Remove all embedded whitespace from the assembled address.
                        char[] removeChrVector = { ' ', '\n', '\t' };
                        address = address.Trim(removeChrVector);
                        foreach (char c in removeChrVector)
                        {
                            address = address.Replace(c.ToString(), "");
                        }
                        poi.Address = address;
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                }
                #endregion
                #region 获取标签
                NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                if (tagsNodeList.Count == 1)
                {
                    INode tagsNode = tagsNodeList[0];
                    if (tagsNode.Children != null)
                    {
                        // Every anchor child is one tag label.
                        for (int j = 0; j < tagsNode.Children.Count; j++)
                        {
                            INode node = tagsNode.Children[j];
                            if (node.GetType().Equals(typeof(ATag)))
                            {
                                poi.Tags.Add(node.ToPlainTextString());
                            }
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                }
                #endregion
                poiList.Add(poi);
            }
        }
    }
    else
    {
        Console.WriteLine("获取POI列表出错");
    }
}
/// <summary> Construct a parser using the provided lexer.
/// A feedback object printing to <see cref="STDOUT"/> is used.
/// This would be used to create a parser for special cases where the
/// normal creation of a lexer on a URLConnection needs to be customized.
/// </summary>
/// <param name="lexer">The lexer to draw characters from.</param>
public Parser(Lexer lexer)
    : this(lexer, STDOUT)
{
}
/// <summary> Construct a parser using the provided lexer and feedback object.
/// This would be used to create a parser for special cases where the
/// normal creation of a lexer on a URLConnection needs to be customized.
/// </summary>
/// <param name="lexer">The lexer to draw characters from. Must not be null.</param>
/// <param name="fb">The object to use when information,
/// warning and error messages are produced. If <em>null</em> no feedback
/// is provided.</param>
/// <exception cref="System.ArgumentNullException">When <paramref name="lexer"/> is null.</exception>
public Parser(Lexer lexer, IParserFeedBack fb)
{
    // Validate before touching any state so a failed construction has no side effects.
    // ArgumentNullException (a subclass of the previously thrown ArgumentException)
    // is the correct exception type for a null argument.
    if (null == lexer)
        throw new System.ArgumentNullException("lexer", "lexer cannot be null");
    Feedback = fb;
    Lexer = lexer;
    NodeFactory = new PrototypicalNodeFactory();
}
/// <summary>
/// Crawl one beauty entry: download the entry page, follow the embedded
/// "/girl/..." link to the personal page, extract the name and hand every
/// picture node off to GetPicUrlsFromBeautyPersonalPage.
/// </summary>
/// <param name="id">Numeric id appended to BeautyFlowBaseUrl to form the entry URL.</param>
static void BeautyFlow(int id)
{
    try
    {
        string htmlContent = DownloadPage(BeautyFlowBaseUrl + id + "/");
        if (htmlContent.Equals(""))
        {
            Console.WriteLine("得到的HTML为空!");
            return;
        }
        Console.WriteLine("第一个html读取完成!");

        // The entry page embeds a relative "/girl/<slug>/" link to the personal page.
        int startIndex = htmlContent.IndexOf("/girl/");
        int endIndex = htmlContent.IndexOf("/", startIndex + 6) + 1;
        string beautyMorePicturesLink = "http://curator.im" + htmlContent.Substring(startIndex, endIndex - startIndex);

        // NOTE(review): like the original code, an empty second page is still parsed
        // (the filters simply find nothing).
        string htmlContentTwo = DownloadPage(beautyMorePicturesLink);
        Console.WriteLine("第二个html读取完成!");

        Lexer lexer = new Lexer(htmlContentTwo);
        Parser parser = new Parser(lexer);

        // The name lives in the single node matched by BeautyNameFilter,
        // formatted "name | ...": keep everything before the '|'.
        parser.AnalyzePage();
        NodeList divList = parser.ExtractAllNodesThatMatch(BeautyNameFilter);
        string beautyName = "";
        if (divList.Count == 1)
        {
            beautyName = divList[0].ToPlainTextString();
            endIndex = beautyName.IndexOf('|') - 1;
            beautyName = beautyName.Substring(0, endIndex);
        }
        else
        {
            Console.WriteLine("获取正妹名称出错! \nid=" + id);
            Console.Read();
            return;
        }

        // Re-scan the page for all picture nodes and download each one.
        parser.AnalyzePage();
        divList = parser.ExtractAllNodesThatMatch(BeautyFlowImgFilter);
        for (int i = 0; i < divList.Count; i++)
        {
            ImageTag imgNode = (ImageTag)divList[i];
            GetPicUrlsFromBeautyPersonalPage(imgNode, i, 2);
        }
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; surface the failure so broken ids are visible,
        // while still keeping the crawl loop alive.
        Console.WriteLine("抓取出错! id=" + id + " : " + ex.Message);
    }
}

/// <summary>
/// GET the given URL and return the response body, or "" on a non-OK status.
/// The response and reader are disposed even when reading throws (the original
/// code leaked both on any exception).
/// </summary>
/// <param name="url">Absolute URL to fetch.</param>
/// <returns>The response body, or the empty string.</returns>
static string DownloadPage(string url)
{
    HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
    httpWebRequest.Method = "GET";
    using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    {
        if (!httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
        {
            return "";
        }
        using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Extract the catalogue links (&lt;a&gt; tags) from the "fenlei_list" navigation block.
/// </summary>
/// <param name="html">Page HTML to parse.</param>
/// <returns>The catalogue link tags; empty when the nav block or links are not found.</returns>
public List<ATag> ParseCatelog(string html)
{
    List<ATag> atags = new List<ATag>();
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);

    NodeFilter nav = new HasAttributeFilter("class", "fenlei_list");
    NodeList navNodes = parser.Parse(nav);
    // Guard: page layout changed or html is empty — no navigation container at all.
    // (The original indexed navNodes[0] unconditionally and could throw.)
    if (navNodes == null || navNodes.Count == 0)
    {
        return atags;
    }

    // Catalogue entries are nodes containing a link matching ../product/index.php?cplm=-NNN-
    NodeFilter catelog = new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$");
    catelog = new HasChildFilter(catelog);
    NodeList catelogNodes = navNodes[0].Children.ExtractAllNodesThatMatch(catelog);
    if (catelogNodes == null)
    {
        return atags;
    }

    int length = catelogNodes.Count;
    for (int i = 0; i < length; i++)
    {
        INode node = catelogNodes[i];
        // Skip malformed entries whose first child is missing or not a link,
        // instead of adding a null that would NRE in callers.
        if (node.Children == null || node.Children.Count == 0)
        {
            continue;
        }
        ATag a = node.Children[0] as ATag;
        if (a != null)
        {
            atags.Add(a);
        }
    }
    return atags;
}
/// <summary> Parses the given text and adopts the resulting tag's contents.</summary>
/// <param name="text">A string of the form &lt;TAGNAME xx="yy"&gt;.</param>
public override void SetText(System.String text)
{
    Lexer textLexer = new Lexer(text);
    try
    {
        TagNode parsed = (TagNode) textLexer.NextNode();
        // Adopt the freshly lexed tag's page, extent and attribute list wholesale.
        mPage = parsed.Page;
        nodeBegin = parsed.StartPosition;
        nodeEnd = parsed.EndPosition;
        mAttributes = parsed.AttributesEx;
    }
    catch (ParserException pe)
    {
        // Surface lexing failures to the caller as an argument problem.
        throw new System.ArgumentException(pe.Message);
    }
}
/// <summary> Collect the children.
/// <p>An initial test is performed for an empty XML tag, in which case
/// the start tag and end tag of the returned tag are the same and it has
/// no children.<p>
/// If it's not an empty XML tag, the lexer is repeatedly asked for
/// subsequent nodes until an end tag is found or a node is encountered
/// that matches the tag ender set or end tag ender set.
/// In the latter case, a virtual end tag is created.
/// Each node found that is not the end tag is added to
/// the list of children. The end tag is special and not a child.<p>
/// Nodes that also have a CompositeTagScanner as their scanner are
/// recursed into, which provides the nested structure of an HTML page.
/// This method operates in two possible modes, depending on a private boolean.
/// It can recurse on the JVM stack, which has caused some overflow problems
/// in the past, or it can use the supplied stack argument to nest scanning
/// of child tags within itself. The former is left as an option in the code,
/// mostly to help subsequent modifiers visualize what the internal nesting
/// is doing.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.
/// </param>
/// <param name="lexer">The source of subsequent nodes.
/// </param>
/// <param name="stack">The parse stack. May contain pending tags that enclose
/// this tag.
/// </param>
/// <returns> The resultant tag (may be unchanged).
/// </returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode node;
    ITag next;
    System.String name;
    IScanner scanner;
    ITag ret;

    ret = tag;

    if (ret.EmptyXmlTag)
    {
        // <tag/> form: the tag is its own end tag and has no children.
        ret.SetEndTag(ret);
    }
    else
        do
        {
            // node == null at the bottom of the loop means "stop scanning".
            node = lexer.NextNode(false);
            if (null != node)
            {
                if (node is ITag)
                {
                    next = (ITag) node;
                    name = next.TagName;
                    // check for normal end tag
                    if (next.IsEndTag() && name.Equals(ret.TagName))
                    {
                        ret.SetEndTag(next);
                        node = null;
                    }
                    else if (IsTagToBeEndedFor(ret, next)) // check DTD
                    {
                        // backup one node; a virtual end tag is inserted later
                        // (by FinishTag) at the current lexer position.
                        lexer.Position = next.StartPosition;
                        node = null;
                    }
                    else if (!next.IsEndTag())
                    {
                        // now recurse if there is a scanner for this type of tag
                        scanner = next.ThisScanner;
                        if (null != scanner)
                        {
                            if (mUseJVMStack)
                            {
                                // JVM stack recursion
                                node = scanner.Scan(next, lexer, stack);
                                AddChild(ret, node);
                            }
                            else
                            {
                                // fake recursion:
                                if (scanner == this)
                                {
                                    if (next.EmptyXmlTag)
                                    {
                                        next.SetEndTag(next);
                                        FinishTag(next, lexer);
                                        AddChild(ret, next);
                                    }
                                    else
                                    {
                                        // push the current tag and continue scanning
                                        // with the nested tag as the new focus.
                                        stack.Add(ret);
                                        ret = next;
                                    }
                                }
                                else
                                {
                                    // normal recursion if switching scanners
                                    node = scanner.Scan(next, lexer, stack);
                                    AddChild(ret, node);
                                }
                            }
                        }
                        else
                            AddChild(ret, next);
                    }
                    else
                    {
                        if (!mUseJVMStack && !mLeaveEnds)
                        {
                            // Since all non-end tags are consumed by the
                            // previous clause, we're here because we have an
                            // end tag with no opening tag... this could be bad.
                            // There are two cases...
                            // 1) The tag hasn't been registered, in which case
                            // we just add it as a simple child, like its
                            // opening tag.
                            // 2) There may be an opening tag further up the
                            // parse stack that needs closing.
                            // So, we ask the factory for a node like this one
                            // (since end tags never have scanners) and see
                            // if its scanner is a composite tag scanner.
                            // If it is we walk up the parse stack looking for
                            // something that needs this end tag to finish it.
                            // If there is something, we close off all the tags
                            // walked over and continue on as if nothing
                            // happened.
                            System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                            attributes.Add(new TagAttribute(name, null));
                            ITag opener = lexer.NodeFactory.CreateTagNode(lexer.Page, next.StartPosition, next.EndPosition, attributes);

                            scanner = opener.ThisScanner;
                            if ((null != scanner) && (scanner == this))
                            {
                                // uh-oh: an unmatched end tag for our own kind of tag.
                                int index = - 1;
                                for (int i = stack.Size() - 1; (- 1 == index) && (i >= 0); i--)
                                {
                                    // short circuit here: assume everything on the stack
                                    // has this as its scanner; we'll need to stop if
                                    // either of those conditions isn't met.
                                    ITag boffo = (ITag) stack.ElementAt(i);
                                    if (name.Equals(boffo.TagName))
                                        index = i;
                                    else if (IsTagToBeEndedFor(boffo, next)) // check DTD
                                        index = i;
                                }
                                if (- 1 != index)
                                {
                                    // finish off the current one first
                                    FinishTag(ret, lexer);
                                    AddChild((ITag) stack.ElementAt(stack.Size() - 1), ret);
                                    // close every pending tag above the match, attaching
                                    // each to its parent on the stack.
                                    for (int i = stack.Size() - 1; i > index; i--)
                                    {
                                        ITag fred = (ITag) stack.Remove(i);
                                        FinishTag(fred, lexer);
                                        AddChild((ITag) stack.ElementAt(i - 1), fred);
                                    }
                                    ret = (ITag) stack.Remove(index);
                                    node = null;
                                }
                                else
                                    AddChild(ret, next); // default behaviour
                            }
                            else
                                AddChild(ret, next); // default behaviour
                        }
                        else
                            AddChild(ret, next);
                    }
                }
                else
                {
                    // non-tag node (text, remark, ...): just a child.
                    AddChild(ret, node);
                    node.DoSemanticAction();
                }
            }

            if (!mUseJVMStack)
            {
                // handle coming out of fake recursion
                if (null == node)
                {
                    int depth = stack.Size();
                    if (0 != depth)
                    {
                        node = stack.ElementAt(depth - 1);
                        if (node is ITag)
                        {
                            ITag precursor = (ITag) node;
                            scanner = precursor.ThisScanner;
                            if (scanner == this)
                            {
                                // pop the enclosing tag, attach the finished one
                                // to it and resume scanning the enclosing tag.
                                stack.Remove(depth - 1);
                                FinishTag(ret, lexer);
                                AddChild(precursor, ret);
                                ret = precursor;
                            }
                            else
                                node = null; // normal recursion
                        }
                        else
                            node = null; // normal recursion
                    }
                }
            }
        }
        while (null != node);

    FinishTag(ret, lexer);

    return (ret);
}
/// <summary>
/// Fetch and parse the live ("走地") match list page.
/// </summary>
/// <returns>The parsed matches; empty on download or parse failure.</returns>
public List<OddsLiveMatch> GetScrollMatchList()
{
    List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
    try
    {
        HttpHelper h = new HttpHelper();
        // NOTE(review): lng=2 appears to select a site language — confirm against the site.
        Cookie lng = new Cookie("lng", "2");
        lng.Domain = domain;
        h.CookieContainer.Add(lng);
        string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + zoudiUrl);
        if (!string.IsNullOrEmpty(zoudi))
        {
            #region Parse the html nodes
            Lexer lexer = new Lexer(zoudi);
            Parser parser = new Parser(lexer);
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
            // Guard against layout changes: the first DIV must be the PageBody container.
            // (The original dereferenced divNode and its ID attribute without null checks.)
            if (divNode != null && divNode.Attributes["ID"] != null && divNode.Attributes["ID"].Equals("PageBody"))
            {
                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                if (dataDivList[0].ToPlainTextString() == "走地盤")
                {
                    // "全場賽果" in the third div means no live matches are listed.
                    if (dataDivList[2].ToPlainTextString() == "全場賽果")
                    {
                        return liveMatchList;
                    }
                    for (int i = 0; i < dataDivList.Count; i++)
                    {
                        ITag div = dataDivList[i] as ITag;
                        if (div != null && div.Attributes["CLASS"] != null && div.Attributes["CLASS"].Equals("menuRow"))
                        {
                            OddsLiveMatch oddsLive = new OddsLiveMatch();
                            oddsLive.urlparams = (div.FirstChild as ITag).Attributes["HREF"].ToString();
                            // Drop the first 4 chars of the first query segment to get the id
                            // — presumably a "?id=" prefix; TODO confirm against a live page.
                            oddsLive.id = oddsLive.urlparams.Split('&')[0].Substring(4);
                            oddsLive.time = DateTime.Now;
                            oddsLive.name = div.ToPlainTextString();
                            liveMatchList.Add(oddsLive);
                        }
                    }
                }
            }
            #endregion
        }
    }
    catch (Exception ex)
    {
        // Best-effort: still return the (possibly empty) list, but no longer
        // swallow failures silently.
        Console.WriteLine("GetScrollMatchList failed: " + ex.Message);
    }
    return liveMatchList;
}
/// <summary>
/// Download a product page from its catalogue link and parse out its title,
/// photos, description and attributes (currently only colours are handled).
/// </summary>
/// <param name="a">Catalogue &lt;a&gt; tag whose Link points at the product page.</param>
public void ParseProduct(ATag a)
{
    string html = GetHtml(a.Link);
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);

    // The whole product block lives under the element with id="productyou".
    NodeFilter productArea = new HasAttributeFilter("id", "productyou");
    NodeList nodes = parser.ExtractAllNodesThatMatch(productArea);
    ParseProductTitle(nodes);
    ParseProductShowPhoto(nodes);
    ParseProductDemoPhoto(nodes);
    ParsePorductDescribe(nodes);

    // Each attribute row has class "cph"; its first child holds the label text.
    // (The unused "chans" lookup and a stray debug Console.WriteLine() were removed.)
    NodeFilter productAttributes = new HasAttributeFilter("class", "cph");
    NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(productAttributes, true);
    int length = productAttributeNodes.Count;
    for (int i = 0; i < length; i++)
    {
        INode n = productAttributeNodes[i].Children[0];
        string t = n.ToPlainTextString();
        // Only the "颜色" (colour) attribute is parsed for now.
        if (Regex.Match(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success)
        {
            ParseProductColors(n);
        }
    }
}