/// <summary>
/// Returns the plain-text content of the first tag (of the given tag name) whose
/// attribute matches the given value.
/// </summary>
/// <param name="html">The HTML document to search. (Fixed: the original doc referred to a non-existent "parser" parameter.)</param>
/// <param name="tag">Tag name to match (e.g. "div").</param>
/// <param name="attribute">Attribute name to look for on the tag.</param>
/// <param name="attValue">Attribute value that must match exactly.</param>
/// <returns>The plain text inside the first matching tag, or null when nothing matches.</returns>
public static string getValue(string html, string tag, string attribute, string attValue)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new TagNameFilter(tag);
    NodeList nodeList = parser.Parse(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        // Guard the cast: skip anything that is not a tag instead of risking a
        // NullReferenceException (the original dereferenced the 'as' result unchecked).
        ITag tagNode = nodeList[i] as ITag;
        if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
        {
            continue;
        }
        foreach (string key in tagNode.Attributes.Keys)
        {
            // "<TAGNAME>" is the parser's synthetic key holding the tag's own name,
            // not a real attribute — never match against it.
            if (key.Contains("<TAGNAME>"))
            {
                continue;
            }
            if (key.Contains(attribute) && tagNode.Attributes[key].ToString() == attValue)
            {
                return tagNode.ToPlainTextString();
            }
        }
    }
    return null;
}
/// <summary>
/// Parses the blog list page and appends every entry's link to blogLinkList.
/// Expects exactly one article container and exactly one anchor per entry;
/// anything else is reported on the console.
/// </summary>
static void GetBlogLink(string htmlContent)
{
    Parser parser = new Parser(new Lexer(htmlContent));
    NodeList articleList = parser.Parse(articleFilter);
    if (articleList.Count != 1)
    {
        Console.WriteLine("获取包含日志列表出错!");
        return;
    }
    NodeList entries = articleList[0].Children.ExtractAllNodesThatMatch(wrapFilter, true);
    for (int index = 0; index < entries.Count; index++)
    {
        NodeList anchors = entries[index].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), false);
        if (anchors.Count == 1)
        {
            blogLinkList.Add(((ATag)anchors[0]).ExtractLink());
        }
        else
        {
            Console.WriteLine("第" + index + "个条目中,判断链接出错!");
        }
    }
}
/// <summary>
/// Scrapes product entries (name, price, image, promotion) from a listing page.
/// Entries that fail to parse are skipped silently — this is best-effort scraping.
/// </summary>
/// <param name="html">The listing page HTML.</param>
/// <returns>All products that parsed successfully; empty list when none did.</returns>
public static List<Product> LoadGoods(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList products = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product"));
    List<Product> result = new List<Product>();
    for (int i = 0; i < products.Count; i++)
    {
        try
        {
            ITag product = products[i] as ITag;

            // Product name.
            NodeList names = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product-title"), true);
            string pname = (names[0] as ITag).ToPlainTextString().Trim();

            // Price (the original comment here wrongly said "name"): skip the
            // leading 7-character currency prefix before parsing.
            NodeList prices = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product-price"), true);
            decimal pprice = Decimal.Parse((prices[0] as ITag).ToPlainTextString().Trim().Substring(7));

            // Image URL lives in the lazy-load attribute rather than src.
            NodeList imgs = product.Children.ExtractAllNodesThatMatch(new TagNameFilter("img"), true);
            string pimg = (imgs[0] as ITag).GetAttribute("DATA-KS-LAZYLOAD");

            // Promotion is optional.
            string ppromo = "";
            NodeList promos = product.Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "promo"), true);
            if (promos.Count > 0)
            {
                ppromo = (promos[0] as ITag).GetAttribute("data-promo");
            }

            Product p = new Product();
            p.img = pimg;
            p.name = pname;
            p.price = pprice;
            p.promo = ppromo;
            result.Add(p);
        }
        catch
        {
            // Malformed entry: ignore and continue with the next product.
        }
    }
    return result;
}
/// <summary>
/// Adds 1x2 odds-history rows for a schedule: fetches each company's history page,
/// parses the odds table and stores every qualifying row via dal.Add.
/// Existing rows for the schedule are deleted first.
/// </summary>
/// <param name="scheduleID">Schedule (match) identifier the rows belong to.</param>
/// <param name="companyids">Comma-separated company IDs; must pair 1:1 with historyids.</param>
/// <param name="historyids">Comma-separated history-page IDs, one per company.</param>
/// <param name="time">Cut-off: rows with a timestamp later than this are skipped.</param>
/// <returns>A JSON string with success=true and the number of rows inserted.</returns>
public string Add(string scheduleID, string companyids, string historyids, DateTime time)
{
    WebClientBLL bll = new WebClientBLL();
    string[] companyidArr = companyids.Split(',');
    string[] historyidArr = historyids.Split(',');
    int count = 0;
    // Only proceed when the two ID lists pair up one-to-one.
    if (companyidArr.Length == historyidArr.Length)
    {
        // Replace semantics: drop any previously stored rows for this schedule.
        dal.Delete(scheduleID);
        for (int i = 0; i < companyidArr.Length; i++)
        {
            string s = bll.GetOddsHistoryContent(historyidArr[i]);
            Lexer lexer = new Lexer(s);
            Parser parser = new Parser(lexer);
            // Drill down HTML -> BODY -> first table; assumes the page always has
            // exactly this structure (no guards — a layout change will throw).
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag table = bodyNodes.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableTag))[0] as ITag;
            NodeList tableRows = table.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.TableRow));
            for (int f = 0; f < tableRows.Count; f++)
            {
                ITag row = tableRows[f] as ITag;
                // Data rows are identified by their ALIGN/BGCOLOR styling.
                if (row.Attributes["ALIGN"].Equals("center") && row.Attributes["BGCOLOR"].Equals("#FFFFFF"))
                {
                    Odds1x2History model = new Odds1x2History();
                    model.companyid = int.Parse(companyidArr[i]);
                    model.scheduleid = int.Parse(scheduleID);
                    // Cells 0..2 hold the home/draw/away odds.
                    model.home = float.Parse(row.Children[0].ToPlainTextString());
                    model.draw = float.Parse(row.Children[1].ToPlainTextString());
                    model.away = float.Parse(row.Children[2].ToPlainTextString());
                    this.FillOdds1x2History(model);
                    // Cell 3 contains a JS call like "showtime(yyyy,mm,dd,hh,mi,ss)";
                    // strip the wrapper and split the arguments.
                    string[] t2 = row.Children[3].ToPlainTextString().Replace("showtime(", "").Replace(")", "").Split(',');
                    int yy = int.Parse(t2[0]);
                    // NOTE(review): Remove(2) keeps only the first 2 chars of the month
                    // field — presumably trimming trailing junk; confirm source format.
                    int mm = int.Parse(t2[1].Remove(2));
                    int dd = int.Parse(t2[2]);
                    int hh = int.Parse(t2[3]);
                    int mi = int.Parse(t2[4]);
                    int ss = int.Parse(t2[5]);
                    // Timestamps are shifted +8h (UTC -> UTC+8 local time of the source site).
                    model.time = new DateTime(yy, mm, dd, hh, mi, ss, DateTimeKind.Utc).AddHours(8d);
                    // Skip rows newer than the supplied cut-off.
                    if (model.time > time)
                    {
                        continue;
                    }
                    dal.Add(model);
                    count++;
                }
            }
        }
    }
    JSONHelper json = new JSONHelper();
    json.success = true;
    // NOTE: "totlalCount" is a (misspelled) property of JSONHelper declared elsewhere.
    json.totlalCount = count;
    return json.ToString();
}
/// <summary>
/// Reads the local catalogue page, downloads every chapter of the story from
/// www.mlxiaoshuo.com and writes the chapter texts to "革命逸事.txt".
/// </summary>
static void GetStoryOfRevolution()
{
    string catalogueHtml;
    using (StreamReader reader = new StreamReader("catalogue.htm"))
    {
        catalogueHtml = reader.ReadToEnd();
    }
    Parser parser = new Parser(new Lexer(catalogueHtml));
    // Chapter links: anchors with the link class whose parent is the chapter list row.
    HasAttributeFilter linkFilterByParent = new HasAttributeFilter("class", "row zhangjieUl");
    HasAttributeFilter linkFilterByClass = new HasAttributeFilter("class", "fontStyle2 colorStyleLink");
    AndFilter linkFilter = new AndFilter(new HasParentFilter(linkFilterByParent, true), linkFilterByClass);
    NodeList linkNodeList = parser.Parse(linkFilter);
    List<string> linkUrlList = new List<string>(linkNodeList.Size());
    List<string> chapterHtmlContentList = new List<string>(linkNodeList.Size());
    for (int i = 0; i < linkNodeList.Size(); i++)
    {
        ATag linkNode = (ATag)linkNodeList[i];
        linkUrlList.Add(linkNode.Link);
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp("http://www.mlxiaoshuo.com" + linkUrlList[linkUrlList.Count - 1]);
        // BUG FIX: the original closed only the LAST reader, never disposed any
        // response, and threw NullReferenceException (Close on null) when the
        // catalogue contained no links. Dispose each reader/response per iteration.
        using (WebResponse response = httpWebRequest.GetResponse())
        using (StreamReader chapterReader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
        {
            chapterHtmlContentList.Add(chapterReader.ReadToEnd());
        }
        Console.WriteLine("第" + (i + 1) + "个页面获取完毕!");
    }
    // Paragraph container of a chapter page.
    HasAttributeFilter praghFilter = new HasAttributeFilter("class", "textP fontStyle2 colorStyleText");
    using (StreamWriter writer = new StreamWriter("革命逸事.txt"))
    {
        for (int i = 0; i < chapterHtmlContentList.Count; i++)
        {
            writer.WriteLine("第" + (i + 1) + "章");
            Parser chapterParser = new Parser(new Lexer(chapterHtmlContentList[i]));
            NodeList praghNodeList = chapterParser.Parse(praghFilter);
            if (praghNodeList.Size() == 1)
            {
                for (int j = 0; j < praghNodeList[0].Children.Size(); j++)
                {
                    // Only <p> children are story text.
                    if (praghNodeList[0].Children[j].GetType().Equals(typeof(ParagraphTag)))
                    {
                        ParagraphTag praghTag = (ParagraphTag)praghNodeList[0].Children[j];
                        writer.WriteLine("    " + praghTag.StringText);
                    }
                }
                writer.WriteLine();
            }
            else
            {
                Console.WriteLine("第" + (i + 1) + "页中,判断段落的标准出错!");
            }
        }
    }
}
/// <summary>
/// Parses a saved cinema schedule page into PlayTime entries.
/// </summary>
/// <param name="xmlFile">Path of the saved page; the first 4-digit run in the path is used as the cinema ID.</param>
/// <returns>The parsed show times, or null when the page contains no "px14" entries.</returns>
public List<PlayTime> getPlayTimes(string xmlFile)
{
    // The cinema ID is embedded in the file path as a 4-digit number.
    Match match = Regex.Match(xmlFile, @"\d\d\d\d");
    string cinemaID = match.Value;
    List<PlayTime> playTimes = new List<PlayTime>();
    string html = File.ReadAllText(xmlFile);
    Lexer lexer = new Lexer(html);
    Parser playParser = new Parser(lexer);
    NodeFilter playFilter = new HasAttributeFilter("CLASS", "px14");
    NodeList playTimeList = playParser.ExtractAllNodesThatMatch(playFilter);
    if (playTimeList.Count >= 1)
    {
        for (int i = 0; i < playTimeList.Count; i++)
        {
            PlayTime playTime = new PlayTime();
            ITag playTag = (playTimeList[i] as ITag);
            // NOTE(review): assumes the entry's first child is the tag carrying the
            // movie link; if it is a text node this cast is null and the Attributes
            // access below throws — confirm against the page layout.
            ITag idTag = (playTag.FirstChild as ITag);
            if (idTag.Attributes != null)
            {
                string strID = idTag.Attributes["HREF"].ToString();
                // Movie IDs are 6 digits; fall back to 5 digits for shorter IDs.
                Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
                if (idMatch.Success)
                {
                    playTime.MovieID = int.Parse(idMatch.Value);
                }
                else
                {
                    Match strMatch = Regex.Match(strID, @"\d\d\d\d\d");
                    if (strMatch.Success)
                    {
                        playTime.MovieID = int.Parse(strMatch.Value);
                    }
                }
            }
            // Show-time text sits two siblings after the entry; trim the surrounding
            // '上'/'映' ("screening") characters before parsing the date.
            string strTime = playTag.NextSibling.NextSibling.ToPlainTextString();
            char[] a = { '上', '映' };
            strTime = strTime.Trim(a);
            playTime.Playtime = DateTime.Parse(strTime);
            playTime.CinemaID = int.Parse(cinemaID);
            playTime.PlayState = true;
            playTimes.Add(playTime);
        }
        return playTimes;
    }
    return null;
}
/// <summary> Scan for script.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
/// <returns>The same tag, completed with its script content and an end tag
/// (a synthetic one when none was found in the input).</returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    System.String language;
    System.String code;
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    if (tag is ScriptTag)
    {
        language = ((ScriptTag) tag).Language;
        // Microsoft Script Encoder payloads ("JScript.Encode"/"VBScript.Encode")
        // must be decoded before the CDATA is consumed.
        if ((null != language) && (language.ToUpper().Equals("JScript.Encode".ToUpper()) || language.ToUpper().Equals("VBScript.Encode".ToUpper())))
        {
            code = ScriptDecoder.Decode(lexer.Page, lexer.Cursor);
            ((ScriptTag) tag).ScriptCode = code;
        }
    }
    // Consume everything up to a plausible end tag as CDATA; non-strict mode
    // (when STRICT is false) tolerates quirky script bodies.
    content = lexer.ParseCDATA(!STRICT);
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Rewind the lexer unless the next node is exactly this tag's end tag.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/script", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length span (start == end) marks the end tag as virtual.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary>
/// Returns the page title extracted from the HTML, or "" (after logging)
/// when exactly one title tag cannot be located.
/// </summary>
static string GetBlogTitle(string htmlContent)
{
    Parser parser = new Parser(new Lexer(htmlContent));
    NodeList titleList = parser.Parse(titleFilter);
    if (titleList.Count != 1)
    {
        Console.WriteLine("获取标题信息出错!");
        return "";
    }
    return ((TitleTag)titleList[0]).Title;
}
/// <summary>
/// Fetches the live ("走地") odds page for a match and, when the full-time 1x2
/// panel is present, stores the odds as a history row via dal.AddHistory.
/// </summary>
/// <param name="matchid">Match identifier stamped onto the stored history row.</param>
/// <param name="urlparams">URL of the live-odds page to fetch.</param>
/// <returns>
/// Always an empty list — the method persists data through dal.AddHistory and
/// never populates the returned list. NOTE(review): confirm callers do not
/// expect this list to be filled.
/// </returns>
public List<OddsLiveMatch> GetMatchScrollOdds(string matchid, string urlparams)
{
    List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
    try
    {
        HttpHelper h = new HttpHelper();
        // Force the site's language via the "lng" cookie before fetching.
        Cookie lng = new Cookie("lng", "2");
        lng.Domain = domain;
        h.CookieContainer.Add(lng);
        //string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + urlparams);
        string zoudi = h.GetHtml(urlparams);
        if (!string.IsNullOrEmpty(zoudi))
        {
            #region 分析网页html节点
            Lexer lexer = new Lexer(zoudi);
            Parser parser = new Parser(lexer);
            // Drill down HTML -> BODY -> FORM -> first DIV; assumes this fixed layout.
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
            if (divNode.Attributes["ID"].Equals("PageBody"))
            {
                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                // Panel header must read "走地盤" (live odds) ...
                if (dataDivList[0].ToPlainTextString() == "走地盤")
                {
                    // ... and the section "全場賽果" (full-time result) must follow.
                    if (dataDivList[2].ToPlainTextString() == "全場賽果")
                    {
                        OddsLiveHistory liveHistory = new OddsLiveHistory();
                        liveHistory.matchid = matchid;
                        // Divs 3/5/7 hold home/draw/away odds; the number is the
                        // first space-separated token of each cell.
                        liveHistory.home = float.Parse(dataDivList[3].ToPlainTextString().Split(' ')[0]);
                        liveHistory.draw = float.Parse(dataDivList[5].ToPlainTextString().Split(' ')[0]);
                        liveHistory.away = float.Parse(dataDivList[7].ToPlainTextString().Split(' ')[0]);
                        liveHistory.time = DateTime.Now;
                        dal.AddHistory(liveHistory);
                    }
                }
            }
            #endregion 分析网页html节点
        }
    }
    catch (Exception)
    {
        // NOTE(review): all failures (network, layout changes, parse errors) are
        // swallowed silently; consider at least logging here.
    }
    return liveMatchList;
}
/// <summary>
/// Extracts the five facility descriptions (dining, parking, game center, 3D, VIP)
/// from a cinema page. Elements with class "c_000" carry the text; the icon class
/// on the element's previous sibling identifies which facility the text describes.
/// </summary>
/// <param name="html">The cinema detail page HTML.</param>
/// <returns>Always true; facilities that are absent leave their out-string empty.</returns>
public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
{
    dining = string.Empty;
    park = string.Empty;
    gameCenter = string.Empty;
    intro3D = string.Empty;
    introVIP = string.Empty;
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "c_000");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        INode node = nodeList[i];
        ITag tagPar = node.Parent as ITag;
        ITag tagSib = node.PreviousSibling as ITag;
        // BUG FIX: the previous sibling can be a text node (e.g. whitespace), which
        // makes the 'as ITag' cast null; the original dereferenced it unconditionally
        // and crashed with NullReferenceException.
        if (tagPar == null || tagSib == null || tagSib.Attributes["CLASS"] == null)
        {
            continue;
        }
        switch (tagSib.Attributes["CLASS"].ToString())
        {
            case "ico_cside1 mr12":
                dining = tagPar.ToPlainTextString();
                break;
            case "ico_cside2 mr12":
                park = tagPar.ToPlainTextString();
                break;
            case "ico_cside3 mr12":
                gameCenter = tagPar.ToPlainTextString();
                break;
            case "ico_cside5 mr12":
                intro3D = tagPar.ToPlainTextString();
                break;
            case "ico_cside7 mr12":
                introVIP = tagPar.ToPlainTextString();
                break;
        }
    }
    return true;
}
/// <summary>
/// Returns the value of attribute <paramref name="attributeV"/> on the single tag
/// whose attribute <paramref name="attribute"/> equals <paramref name="attValue"/>.
/// </summary>
/// <param name="html">The HTML document to search.</param>
/// <param name="tag">Unused; kept for signature compatibility with the sibling getValue/getValues helpers.</param>
/// <param name="attribute">Attribute name used to locate the tag.</param>
/// <param name="attValue">Attribute value used to locate the tag.</param>
/// <param name="attributeV">Name of the attribute whose value is returned.</param>
/// <returns>The attribute value, or null when no single matching tag is found.</returns>
public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    // Only answer when the match is unambiguous (exactly one hit).
    if (nodeList.Count == 1)
    {
        ITag tagNode = nodeList[0] as ITag;
        if (tagNode != null && tagNode.Attributes != null)
        {
            return tagNode.Attributes[attributeV].ToString();
        }
    }
    return null;
}
/// <summary> Scan for style definitions.
/// Accumulates text from the page, until &lt;/[a-zA-Z] is encountered.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.</param>
/// <param name="lexer">The source of CDATA.</param>
/// <param name="stack">The parse stack, <em>not used</em>.</param>
/// <returns>The same tag, completed with its CSS content and an end tag
/// (a synthetic one when none was found in the input).</returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode content;
    int position;
    INode node;
    TagAttribute attribute;
    System.Collections.ArrayList vector;
    // Consume the style sheet body as CDATA.
    content = lexer.ParseCDATA();
    position = lexer.Position;
    node = lexer.NextNode(false);
    if (null != node)
        // Rewind the lexer unless the next node is exactly this tag's end tag.
        if (!(node is ITag) || !(((ITag) node).IsEndTag() && ((ITag) node).TagName.Equals(tag.Ids[0])))
        {
            lexer.Position = position;
            node = null;
        }
    // build new end tag if required
    if (null == node)
    {
        attribute = new TagAttribute("/style", null);
        vector = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
        vector.Add(attribute);
        // Zero-length span (start == end) marks the end tag as virtual.
        node = lexer.NodeFactory.CreateTagNode(lexer.Page, position, position, vector);
    }
    tag.SetEndTag((ITag) node);
    if (null != content)
    {
        tag.Children = new NodeList(content);
        content.Parent = tag;
    }
    node.Parent = tag;
    tag.DoSemanticAction();
    return (tag);
}
/// <summary>
/// Reads the cinema rating from the page. The rating is split across the first
/// and last child of the "point ml12 px18" element (presumably integer part and
/// fractional part, e.g. "7" + ".5"), which are concatenated and parsed.
/// </summary>
/// <param name="html">The cinema detail page HTML.</param>
/// <returns>The parsed rating, or the fallback 7.0 when the element is absent or malformed.</returns>
public float getCinemaGrade(string html)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeFilter nodeFilter = new HasAttributeFilter("CLASS", "point ml12 px18");
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    if (nodeList.Count == 1)
    {
        INode node = nodeList[0];
        ITag tagLeft = node.FirstChild as ITag;
        ITag tagRight = node.LastChild as ITag;
        // Guard the casts: a missing child previously caused a NullReferenceException.
        if (tagLeft != null && tagRight != null)
        {
            string strGrade = tagLeft.ToPlainTextString() + tagRight.ToPlainTextString();
            float grade;
            // BUG FIX: parse with the invariant culture — the page always uses '.'
            // as decimal separator, so parsing must not depend on the host's
            // regional settings (and malformed text now falls back instead of throwing).
            if (float.TryParse(strGrade, System.Globalization.NumberStyles.Float,
                               System.Globalization.CultureInfo.InvariantCulture, out grade))
            {
                return grade;
            }
        }
    }
    return 7.0f;
}
/// <summary>
/// Finds the first product-detail link (matching "lookcp.php?cpid=N") in the HTML
/// and rewrites it in place to an absolute rrxf.cn URL.
/// </summary>
/// <param name="html">HTML fragment to search.</param>
/// <returns>The anchor tag with its Link rewritten to an absolute URL.</returns>
public ATag ParseProductUrl(string html)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList matches = parser.Parse(new LinkRegexFilter(@"lookcp\.php\?cpid\=\d{0,}"));
    ATag anchor = matches[0] as ATag;
    anchor.Link = "http://rrxf.cn/product/" + anchor.Link;
    return anchor;
}
/// <summary> Finish off a tag.
/// When the tag has no end tag yet, a virtual one is created at the lexer's
/// current position. The end tag is then parented to the tag and the tag's
/// semantic action is fired.
/// </summary>
/// <param name="tag">The tag to finish off.</param>
/// <param name="lexer">A lexer positioned at the end of the tag.</param>
protected internal virtual void FinishTag(ITag tag, Lexer lexer)
{
    if (tag.GetEndTag() == null)
    {
        tag.SetEndTag(CreateVirtualEndTag(tag, lexer, lexer.Page, lexer.Cursor.Position));
    }
    tag.GetEndTag().Parent = tag;
    tag.DoSemanticAction();
}
/// <summary>
/// Fetches the category page referenced by <paramref name="a"/>, then resolves
/// every "photoyi" block's product link and hands it to ParseProduct.
/// </summary>
public void ParseProducts(ATag a)
{
    string html = GetHtml(a.Link.Replace("../", "http://rrxf.cn/"));
    Parser parser = new Parser(new Lexer(html));
    NodeList navNodes = parser.Parse(new HasAttributeFilter("class", "photoyi"));
    if (navNodes == null)
    {
        return;
    }
    int total = navNodes.Count;
    for (int index = 0; index < total; index++)
    {
        ATag productLink = ParseProductUrl(navNodes[index].ToHtml());
        Console.WriteLine(productLink.Link);
        ParseProduct(productLink);
    }
}
/// <summary> Creates an end tag with the same name as the given tag.</summary>
/// <param name="tag">The tag to end.</param>
/// <param name="lexer">The object containing the node factory.</param>
/// <param name="page">The page the tag is on (virtually).</param>
/// <param name="position">The offset into the page at which the tag is to be anchored.</param>
/// <returns> An end tag named '"/" + tag.getTagName()' whose start and end
/// positions are both the given position. The equal positions may be used to
/// distinguish it as a virtual tag later on.</returns>
protected internal virtual ITag CreateVirtualEndTag(ITag tag, Lexer lexer, Page page, int position)
{
    System.String name = "/" + tag.RawTagName;
    System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
    attributes.Add(new TagAttribute(name, (System.String) null));
    return lexer.NodeFactory.CreateTagNode(page, position, position, attributes);
}
/// <summary>
/// Collects the plain-text content of every tag whose attribute matches the given value.
/// </summary>
/// <param name="html">The HTML document to search.</param>
/// <param name="tag">Unused; kept for signature compatibility with the sibling getValue helper.</param>
/// <param name="attribute">Attribute name to match.</param>
/// <param name="attValue">Attribute value to match.</param>
/// <returns>Plain-text contents of all matching tags; an empty list when none match.</returns>
public static List<string> getValues(string html, string tag, string attribute, string attValue)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    List<string> values = new List<string>();
    NodeFilter nodeFilter = new HasAttributeFilter(attribute, attValue);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(nodeFilter);
    for (int i = 0; i < nodeList.Count; i++)
    {
        // Guard the cast and skip attribute-less tags, mirroring the filter's intent.
        ITag tagNode = nodeList[i] as ITag;
        if (tagNode != null && tagNode.Attributes != null && tagNode.Attributes.Count > 0)
        {
            values.Add(tagNode.ToPlainTextString());
        }
    }
    return values;
}
/// <summary>
/// Downloads the card table from Config.Uri, parses the even/odd table rows into
/// Card objects, serializes the list to Config.CardsXml and finally downloads each
/// card's image into Config.ImagePath.
/// </summary>
/// <param name="notifier">Optional progress sink; may be null.</param>
public void GetFromWeb(IGetFromWebNotify notifier)
{
    Directory.CreateDirectory(Config.ImagePath);
    if (notifier != null) notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
    WebClient webClient = new WebClient();
    webClient.Encoding = Encoding.UTF8;
    String strHtml = webClient.DownloadString(Config.Uri);
    if (notifier != null) notifier.Notity("����html�ĵ�", 0.0f);
    Lexer lexer = new Lexer(strHtml);
    Parser parser = new Parser(lexer);
    // Card rows are <tr class="even"> or <tr class="odd">.
    AndFilter andFilter = new AndFilter(
        new NodeClassFilter(typeof(TableRow)),
        new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
    NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);
    // NOTE(review): lock(this) is fragile (external code can lock the same object);
    // a private gate field would be safer but cannot be added from within this method.
    lock (this)
    {
        m_Cards = new List<Card>();
        foreach (INode node in htmlNodes.ToNodeArray())
        {
            int iFiledIndex = 0;
            Card card = new Card();
            foreach (INode subNode in node.Children.ToNodeArray())
            {
                // Whitespace between cells appears as text nodes; only element
                // children count as table fields.
                if (subNode is TextNode)
                {
                    continue;
                }
                switch (iFiledIndex)
                {
                    case 0:
                        // Numeric card ID; the image path is derived from it.
                        card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                        card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                        break;
                    case 1:
                        // Name is nested one level deeper (inside an anchor).
                        card.Name = subNode.FirstChild.FirstChild.GetText();
                        break;
                    case 2:
                        StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                        break;
                    case 3:
                        // Mana-cost cell may be empty.
                        if (subNode.FirstChild != null)
                        {
                            card.ManaCost = subNode.FirstChild.GetText();
                        }
                        else
                        {
                            card.ManaCost = String.Empty;
                        }
                        break;
                    case 4:
                        card.Rare = subNode.FirstChild.GetText();
                        break;
                }
                iFiledIndex++;
            }
            m_Cards.Add(card);
        }
    }
    XmlSerializer s = new XmlSerializer(typeof(List<Card>));
    // BUG FIX: the stream was never disposed when Serialize threw, leaking the file
    // handle. NOTE(review): FileMode.CreateNew throws if the file already exists —
    // confirm that is intended.
    using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
    {
        s.Serialize(fstream, m_Cards);
    }
    foreach (Card card in m_Cards)
    {
        if (notifier != null) notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
        webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
    }
}
/// <summary>
/// Fetches the "one day one beauty" page for the given date, extracts the image
/// nodes and forwards each to GetPicUrlsFromBeautyPersonalPage.
/// </summary>
/// <param name="date">Date path segment appended to oneDayOneBeautyBaseUrl.</param>
/// <returns>The number of image nodes processed; 0 on error or when the page is empty.</returns>
static int OneDayOneBeauty(string date)
{
    try
    {
        string htmlContent = "";
        string url = oneDayOneBeautyBaseUrl + date + "/";
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
        httpWebRequest.Method = "GET";
        HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse();
        if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
        {
            StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024));
            htmlContent = reader.ReadToEnd();
            httpWebResponse.Close();
            reader.Close();
        }
        if (!htmlContent.Equals(""))
        {
            Lexer lexer = new Lexer(htmlContent);
            Parser parser = new Parser(lexer);
            parser.AnalyzePage();
            NodeList divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter);
            // Fall back to the alternate filter (page structure changed 2014-05-16).
            if (divList.Count == 0)
            {
                parser.AnalyzePage();
                divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter2);
            }
            for (int i = 0; i < divList.Count; i++)
            {
                ImageTag imgNode = (ImageTag)divList[i];
                GetPicUrlsFromBeautyPersonalPage(imgNode, i, 1);
            }
            return divList.Count;
        }
        else
        {
            Console.WriteLine("得到的HTML为空!");
            return 0;
        }
    }
    catch (WebException e)
    {
        // BUG FIX: e.Response is null for timeouts / DNS failures; the original
        // dereferenced it unconditionally and crashed inside its own catch block.
        HttpWebResponse httpWebResponse = e.Response as HttpWebResponse;
        if (httpWebResponse == null)
        {
            Console.WriteLine("访问网页出错!状态码:" + e.Status);
            return 0;
        }
        if (httpWebResponse.StatusCode.Equals(HttpStatusCode.NotFound))
        {
            Console.WriteLine("网页未找到!");
        }
        else
        {
            Console.WriteLine("访问网页出错!状态码:" + httpWebResponse.StatusCode);
        }
        httpWebResponse.Close();
        return 0;
    }
}
/// <summary>
/// Parses the current fans page (field currentHtmlContent) of the web version of
/// Weibo and appends one Fan per list entry to <paramref name="fansList"/>:
/// portrait (downloaded asynchronously), follow/fans/feeds counts, introduction,
/// user ID, location, gender and follow method. Structural mismatches are
/// reported on the console and the affected field is left at its default.
/// </summary>
/// <param name="fansList">Receives the fans scraped from the page.</param>
public void GetInfoFromHtml(List<Fan> fansList)
{
    Lexer lexer = new Lexer(currentHtmlContent);
    Parser parser = new Parser(lexer);
    // List of the <li> entries, one per fan.
    NodeList fansNodeList = parser.Parse(fanFilter);
    for (int i = 0; i < fansNodeList.Size(); i++)
    {
        Fan fan = new Fan();
        // The <li> bullet containing one fan.
        Bullet fanBullet = (Bullet)fansNodeList[i];
        #region Fan portrait
        NodeList fanPortraitNodeList = fanBullet.Children.ExtractAllNodesThatMatch(portraitFilter, true);
        if (fanPortraitNodeList.Size() == 1)
        {
            Div fanPortraitDiv = (Div)fanPortraitNodeList[0];
            NodeList imgNodeList = fanPortraitDiv.Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ImageTag)), true);
            if (imgNodeList.Size() == 1)
            {
                ImageTag imgNode = (ImageTag)imgNodeList[0];
                if (imgNode.Attributes.ContainsKey("SRC") && imgNode.Attributes.ContainsKey("ALT"))
                {
                    string imgUrl = imgNode.GetAttribute("SRC");
                    string imgName = imgNode.GetAttribute("ALT");
                    // The ALT text doubles as the fan's display name.
                    fan.Name = imgName;
                    // WebClient is used here because downloading the portrait does
                    // not require the login cookie.
                    WebClient wc = new WebClient();
                    // BUG FIX: subscribe BEFORE starting the async download — the
                    // original attached the handler after DownloadFileAsync, so a
                    // fast completion could fire before the handler existed.
                    wc.DownloadFileCompleted += wc_DownloadFileCompleted;
                    wc.DownloadFileAsync(new Uri(imgUrl), @"portrait\" + imgName + ".jpg");
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,<img>标记缺少必要的属性!");
                }
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取img标记出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取粉丝头像的标准出错!");
        }
        #endregion
        #region Follow / fans / feeds counts
        NodeList fanConnectNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanConnectFilter, true);
        if (fanConnectNodeList.Size() == 1)
        {
            NodeList ATagList = fanConnectNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
            // Exactly three anchors: follow count, fans count, feeds count.
            if (ATagList.Size() == 3)
            {
                for (int j = 0; j < 3; j++)
                {
                    ATag aTag = (ATag)ATagList[j];
                    switch (j)
                    {
                        case 0:
                            if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("follow"))
                            {
                                fan.FollowCount = Int32.Parse(aTag.StringText);
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "个粉丝中,获取粉丝的关注数出错!");
                            }
                            break;
                        case 1:
                            if (aTag.Attributes.ContainsKey("HREF") && aTag.GetAttribute("HREF").Contains("fans"))
                            {
                                fan.FansCount = Int32.Parse(aTag.StringText);
                            }
                            else
                            {
                                Console.WriteLine("第" + i + "个粉丝中,获取粉丝的粉丝数出错!");
                            }
                            break;
                        default:
                            fan.FeedsCount = Int32.Parse(aTag.StringText);
                            break;
                    }
                }
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的数量出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取粉丝关注数/粉丝数/微博数的标准出错!");
        }
        #endregion
        #region Introduction
        NodeList fanInfoNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanInfoFilter, true);
        if (fanInfoNodeList.Size() == 1)
        {
            //Console.WriteLine(fanInfoNodeList[0].Parent.ToHtml());
            Div fanInfoDiv = (Div)fanInfoNodeList[0];
            string intro = fanInfoDiv.StringText;
            // Text is "简介:<text>"; strip the label and normalize whitespace.
            if (intro.Substring(0, 2).Equals("简介"))
            {
                fan.Introduction = intro.Substring(3, intro.Length - 3).Replace("\n", " ").Replace("\t", " ");
            }
        }
        else
        {
            if (fanInfoNodeList.Size() == 0)
            {
                // No introduction block on the page: treat as empty.
                fan.Introduction = "";
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝简介的标准出错!");
            }
        }
        #endregion
        #region UserID, location and gender; verify the user name
        NodeList fanLocationNodeList = fanBullet.Children.ExtractAllNodesThatMatch(fanNameFilter, true);
        if (fanLocationNodeList.Size() == 1)
        {
            // UserID and link come from the first anchor; it also carries the
            // display name used to cross-check the portrait's ALT text.
            NodeList aTagNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)), true);
            if (aTagNodeList.Size() >= 1)
            {
                ATag nameNode = (ATag)aTagNodeList[0];
                if (nameNode.Attributes.ContainsKey("USERCARD") && nameNode.Attributes.ContainsKey("HREF"))
                {
                    // USERCARD is "id=<uid>".
                    string uidStr = nameNode.GetAttribute("USERCARD");
                    if (uidStr.Substring(0, 3).Equals("id="))
                    {
                        fan.UserID = uidStr.Substring(3, uidStr.Length - 3);
                    }
                    // Weibo profile link.
                    string linkUrl = nameNode.GetAttribute("HREF");
                    fan.LinkURL = "http://www.weibo.com" + linkUrl;
                }
                else
                {
                    Console.WriteLine("第" + i + "个粉丝中,包含用户id和链接的<a>标记中缺少必要的属性!");
                }
                // Cross-check: anchor text should match the portrait's ALT name.
                if (!nameNode.StringText.Equals(fan.Name))
                {
                    Console.WriteLine("第" + i + "个粉丝中,用户名与用户头像文字描述不一致!");
                }
            }
            // Location and gender come from the "addr" element.
            NodeList locationNodeList = fanLocationNodeList[0].Children.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "addr"), true);
            if (locationNodeList.Size() == 1)
            {
                string locationStr = "";
                for (int j = 0; j < locationNodeList[0].Children.Size(); j++)
                {
                    INode node = locationNodeList[0].Children[j];
                    if (node.GetType().Equals(typeof(TextNode)))
                    {
                        TextNode tNode = (TextNode)node;
                        locationStr += tNode.ToPlainTextString();
                    }
                    if (node.GetType().Equals(typeof(TagNode)))
                    {
                        TagNode tNode = (TagNode)node;
                        if (tNode.Attributes.ContainsKey("CLASS"))
                        {
                            // Check "female" FIRST: the string "female" contains
                            // "male", so testing "male" first would match everyone.
                            if (tNode.GetAttribute("CLASS").Contains("female"))
                            {
                                fan.Gender = "female";
                            }
                            else
                            {
                                if (tNode.GetAttribute("CLASS").Contains("male"))
                                {
                                    fan.Gender = "male";
                                }
                                else
                                {
                                    fan.Gender = "unknown";
                                    Console.WriteLine("第" + i + "个粉丝性别不明!");
                                }
                            }
                        }
                    }
                }
                fan.Location = locationStr.Trim();
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取粉丝地点的标准出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取该粉丝的UserID、地点和性别信息的标准出错!");
        }
        #endregion
        #region Follow method
        NodeList followMethodNodeList = fanBullet.Children.ExtractAllNodesThatMatch(followMethodFilter, true);
        if (followMethodNodeList.Size() == 1)
        {
            NodeList methodNodeList = followMethodNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
            if (methodNodeList.Size() == 1)
            {
                ATag methodNode = (ATag)methodNodeList[0];
                fan.FollowMethod = methodNode.StringText.Trim();
            }
            else
            {
                Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的数量出错!");
            }
        }
        else
        {
            Console.WriteLine("第" + i + "个粉丝中,获取该粉丝关注用户的方式的标准出错!");
        }
        #endregion
        fansList.Add(fan);
    }
}
/// <summary>
/// Parses the current POI (restaurant) list page (field currentHtml) and appends
/// one POI per "DD" entry to poiList: taste/environment/service scores, star rank,
/// average cost, comment count, shop name, address/phone and tags. Structural
/// mismatches are reported on the console.
/// </summary>
/// <param name="currentPage">Page number recorded on each parsed POI.</param>
public void GetInfoFromHtml(int currentPage)
{
    Lexer lexer = new Lexer(currentHtml);
    Parser parser = new Parser(lexer);
    NodeList poiHeadList = parser.Parse(poiListFilter);
    if (poiHeadList.Count == 1)
    {
        NodeList poiNodeList = poiHeadList[0].Children.ExtractAllNodesThatMatch(poiFilter, false);
        int numCount = 0;
        for (int i = 0; i < poiNodeList.Count; i++)
        {
            POI poi = new POI();
            DefinitionListBullet poiNode = (DefinitionListBullet)poiNodeList[i];
            // Only <dd> bullets are real POI entries.
            if (poiNode.TagName.Equals("DD"))
            {
                numCount++;
                poi.Page = currentPage;
                poi.Number = numCount;
                #region 获取口味、环境和服务评分,以及获取星级
                NodeList tasteNodeList = poiNode.Children.ExtractAllNodesThatMatch(tasteFilter, true);
                NodeList environmentNodeList = poiNode.Children.ExtractAllNodesThatMatch(environmentFilter, true);
                NodeList serviceNodeList = poiNode.Children.ExtractAllNodesThatMatch(serviceFilter, true);
                if (tasteNodeList.Count == 1 && environmentNodeList.Count == 1 && serviceNodeList.Count == 1)
                {
                    // "-" marks a missing score; the property keeps its default then.
                    Span spanNode = (Span)tasteNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.TasteRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    spanNode = (Span)environmentNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.EnvironmentRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    spanNode = (Span)serviceNodeList[0];
                    if (!spanNode.ToPlainTextString().Equals("-"))
                    {
                        poi.ServiceRemark = Int32.Parse(spanNode.ToPlainTextString());
                    }
                    #region 获取星级
                    // The star-rank span sits two siblings after the score container;
                    // its TITLE attribute contains the Chinese numeral (一..五) of the rank.
                    INode rankNodeOfParent = spanNode.Parent.NextSibling.NextSibling;
                    if (rankNodeOfParent.Children != null && rankNodeOfParent.Children.Count >= 1)
                    {
                        INode rankNodeCandidate = rankNodeOfParent.Children[0];
                        if (rankNodeCandidate.GetType().Equals(typeof(Span)))
                        {
                            Span rankNode = (Span)rankNodeCandidate;
                            string rank = rankNode.GetAttribute("TITLE");
                            // 五/四/三/二/一 map to 5/4/3/2/1 stars.
                            if (rank.Contains("五"))
                            {
                                poi.Rank = 5;
                            }
                            else
                            {
                                if (rank.Contains("四"))
                                {
                                    poi.Rank = 4;
                                }
                                else
                                {
                                    if (rank.Contains("三"))
                                    {
                                        poi.Rank = 3;
                                    }
                                    else
                                    {
                                        if (rank.Contains("二"))
                                        {
                                            poi.Rank = 2;
                                        }
                                        else
                                        {
                                            if (rank.Contains("一"))
                                            {
                                                poi.Rank = 1;
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                    #endregion
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断口味、环境和服务的标准出错!");
                }
                #endregion
                #region 获取平均消费
                NodeList averageNodeList = poiNode.Children.ExtractAllNodesThatMatch(averageFilter, true);
                if (averageNodeList.Count == 1)
                {
                    INode averageNode = averageNodeList[0];
                    // The cost figure is the text node two siblings after the label.
                    if (averageNode.NextSibling.NextSibling.GetType().Equals(typeof(TextNode)))
                    {
                        string cost = ((TextNode)averageNode.NextSibling.NextSibling).ToPlainTextString();
                        poi.AverageCost = Int32.Parse(cost);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断平均消费的标准出错!");
                }
                #endregion
                #region 获取点评数
                NodeList commentNodeList = poiNode.Children.ExtractAllNodesThatMatch(commentFilter, true);
                if (commentNodeList.Count == 1)
                {
                    INode commentNode = commentNodeList[0];
                    if (commentNode.GetType().Equals(typeof(ATag)))
                    {
                        string commentNum = ((ATag)commentNode).StringText;
                        // Strip the trailing "封点评" ("reviews") suffix before parsing.
                        if (commentNum.Substring(commentNum.Length - 3, 3).Equals("封点评"))
                        {
                            commentNum = commentNum.Substring(0, commentNum.Length - 3);
                        }
                        poi.CommentCount = Int32.Parse(commentNum);
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断点评数的标准出错!");
                }
                #endregion
                #region 获取店名
                NodeList nameNodeList = poiNode.Children.ExtractAllNodesThatMatch(nameFilter, true);
                if (nameNodeList.Count == 1)
                {
                    INode nameNode = nameNodeList[0];
                    if (nameNode.GetType().Equals(typeof(ATag)))
                    {
                        poi.Name = ((ATag)nameNode).StringText;
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断店名的标准出错!");
                }
                #endregion
                #region 获取地址和电话
                NodeList addressNodeList = poiNode.Children.ExtractAllNodesThatMatch(addressFilter, true);
                if (addressNodeList.Count == 1)
                {
                    NodeList districtNodeList = addressNodeList[0].Children.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(ATag)));
                    if (districtNodeList.Count == 1)
                    {
                        ATag districtTag = (ATag)districtNodeList[0];
                        string address = districtTag.ToPlainTextString();
                        if (districtTag.NextSibling.GetType().Equals(typeof(TextNode)))
                        {
                            TextNode detailAddressNode = (TextNode)districtTag.NextSibling;
                            string detailAddress = detailAddressNode.ToPlainTextString();
                            detailAddress = detailAddress.Trim();
                            // NOTE(review): assumes the last 8 characters are always
                            // the phone number — confirm against the page format;
                            // shorter strings would make Substring throw.
                            string phoneStr = detailAddress.Substring(detailAddress.Length - 8, 8);
                            poi.Phone = phoneStr;
                            address += detailAddress.Substring(0, detailAddress.Length - 8);
                        }
                        // Remove all embedded whitespace from the assembled address.
                        char[] removeChrVector = { ' ', '\n', '\t' };
                        address = address.Trim(removeChrVector);
                        foreach (char c in removeChrVector)
                        {
                            address = address.Replace(c.ToString(), "");
                        }
                        poi.Address = address;
                    }
                    else
                    {
                        Console.WriteLine("第" + i + "条POI中,判断含地址的<a>标记的标准出错!");
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断地址的标准出错!");
                }
                #endregion
                #region 获取标签
                NodeList tagsNodeList = poiNode.Children.ExtractAllNodesThatMatch(tagsFilter, true);
                if (tagsNodeList.Count == 1)
                {
                    INode tagsNode = tagsNodeList[0];
                    if (tagsNode.Children != null)
                    {
                        // Every anchor child is one tag label.
                        for (int j = 0; j < tagsNode.Children.Count; j++)
                        {
                            INode node = tagsNode.Children[j];
                            if (node.GetType().Equals(typeof(ATag)))
                            {
                                poi.Tags.Add(node.ToPlainTextString());
                            }
                        }
                    }
                }
                else
                {
                    Console.WriteLine("第" + i + "条POI中,判断标签的标准出错!");
                }
                #endregion
                poiList.Add(poi);
            }
        }
    }
    else
    {
        Console.WriteLine("获取POI列表出错");
    }
}
/// <summary> Construct a parser using the provided lexer.
/// A feedback object printing to <see cref="STDOUT"/> is used.
/// This would be used to create a parser for special cases where the
/// normal creation of a lexer on a URLConnection needs to be customized.
/// </summary>
/// <param name="lexer">The lexer to draw characters from.</param>
public Parser(Lexer lexer)
    : this(lexer, STDOUT)
{
}
/// <summary> Construct a parser using the provided lexer and feedback object.
/// This would be used to create a parser for special cases where the
/// normal creation of a lexer on a URLConnection needs to be customized.
/// </summary>
/// <param name="lexer">The lexer to draw characters from. Must not be null.</param>
/// <param name="fb">The object to use when information,
/// warning and error messages are produced. If <em>null</em> no feedback
/// is provided.</param>
/// <exception cref="System.ArgumentNullException">When <paramref name="lexer"/> is null.</exception>
public Parser(Lexer lexer, IParserFeedBack fb)
{
    // Validate before touching any state so a failed construction has no side effects.
    // ArgumentNullException (a subclass of the previously thrown ArgumentException)
    // is the correct exception type for a null argument.
    if (null == lexer)
        throw new System.ArgumentNullException("lexer", "lexer cannot be null");
    Feedback = fb;
    Lexer = lexer;
    NodeFactory = new PrototypicalNodeFactory();
}
/// <summary>
/// Crawl one beauty entry: download the entry page, follow the embedded
/// "/girl/..." link to the personal page, extract the name and hand every
/// picture node off to GetPicUrlsFromBeautyPersonalPage.
/// </summary>
/// <param name="id">Numeric id appended to BeautyFlowBaseUrl to form the entry URL.</param>
static void BeautyFlow(int id)
{
    try
    {
        string htmlContent = DownloadPage(BeautyFlowBaseUrl + id + "/");
        if (htmlContent.Equals(""))
        {
            Console.WriteLine("得到的HTML为空!");
            return;
        }
        Console.WriteLine("第一个html读取完成!");

        // The entry page embeds a relative "/girl/<slug>/" link to the personal page.
        int startIndex = htmlContent.IndexOf("/girl/");
        int endIndex = htmlContent.IndexOf("/", startIndex + 6) + 1;
        string beautyMorePicturesLink = "http://curator.im" + htmlContent.Substring(startIndex, endIndex - startIndex);

        // NOTE(review): like the original code, an empty second page is still parsed
        // (the filters simply find nothing).
        string htmlContentTwo = DownloadPage(beautyMorePicturesLink);
        Console.WriteLine("第二个html读取完成!");

        Lexer lexer = new Lexer(htmlContentTwo);
        Parser parser = new Parser(lexer);

        // The name lives in the single node matched by BeautyNameFilter,
        // formatted "name | ...": keep everything before the '|'.
        parser.AnalyzePage();
        NodeList divList = parser.ExtractAllNodesThatMatch(BeautyNameFilter);
        string beautyName = "";
        if (divList.Count == 1)
        {
            beautyName = divList[0].ToPlainTextString();
            endIndex = beautyName.IndexOf('|') - 1;
            beautyName = beautyName.Substring(0, endIndex);
        }
        else
        {
            Console.WriteLine("获取正妹名称出错! \nid=" + id);
            Console.Read();
            return;
        }

        // Re-scan the page for all picture nodes and download each one.
        parser.AnalyzePage();
        divList = parser.ExtractAllNodesThatMatch(BeautyFlowImgFilter);
        for (int i = 0; i < divList.Count; i++)
        {
            ImageTag imgNode = (ImageTag)divList[i];
            GetPicUrlsFromBeautyPersonalPage(imgNode, i, 2);
        }
    }
    catch (Exception ex)
    {
        // Previously swallowed silently; surface the failure so broken ids are visible,
        // while still keeping the crawl loop alive.
        Console.WriteLine("抓取出错! id=" + id + " : " + ex.Message);
    }
}

/// <summary>
/// GET the given URL and return the response body, or "" on a non-OK status.
/// The response and reader are disposed even when reading throws (the original
/// code leaked both on any exception).
/// </summary>
/// <param name="url">Absolute URL to fetch.</param>
/// <returns>The response body, or the empty string.</returns>
static string DownloadPage(string url)
{
    HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
    httpWebRequest.Method = "GET";
    using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
    {
        if (!httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
        {
            return "";
        }
        using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Extract the catalogue links (&lt;a&gt; tags) from the "fenlei_list" navigation block.
/// </summary>
/// <param name="html">Page HTML to parse.</param>
/// <returns>The catalogue link tags; empty when the nav block or links are not found.</returns>
public List<ATag> ParseCatelog(string html)
{
    List<ATag> atags = new List<ATag>();
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);

    NodeFilter nav = new HasAttributeFilter("class", "fenlei_list");
    NodeList navNodes = parser.Parse(nav);
    // Guard: page layout changed or html is empty — no navigation container at all.
    // (The original indexed navNodes[0] unconditionally and could throw.)
    if (navNodes == null || navNodes.Count == 0)
    {
        return atags;
    }

    // Catalogue entries are nodes containing a link matching ../product/index.php?cplm=-NNN-
    NodeFilter catelog = new LinkRegexFilter(@"^\.\./product/index\.php\?cplm\=\-\d\d\d\-$");
    catelog = new HasChildFilter(catelog);
    NodeList catelogNodes = navNodes[0].Children.ExtractAllNodesThatMatch(catelog);
    if (catelogNodes == null)
    {
        return atags;
    }

    int length = catelogNodes.Count;
    for (int i = 0; i < length; i++)
    {
        INode node = catelogNodes[i];
        // Skip malformed entries whose first child is missing or not a link,
        // instead of adding a null that would NRE in callers.
        if (node.Children == null || node.Children.Count == 0)
        {
            continue;
        }
        ATag a = node.Children[0] as ATag;
        if (a != null)
        {
            atags.Add(a);
        }
    }
    return atags;
}
/// <summary> Parses the given text and adopts the resulting tag's contents.</summary>
/// <param name="text">A string of the form &lt;TAGNAME xx="yy"&gt;.</param>
public override void SetText(System.String text)
{
    Lexer textLexer = new Lexer(text);
    try
    {
        TagNode parsed = (TagNode) textLexer.NextNode();
        // Adopt the freshly lexed tag's page, extent and attribute list wholesale.
        mPage = parsed.Page;
        nodeBegin = parsed.StartPosition;
        nodeEnd = parsed.EndPosition;
        mAttributes = parsed.AttributesEx;
    }
    catch (ParserException pe)
    {
        // Surface lexing failures to the caller as an argument problem.
        throw new System.ArgumentException(pe.Message);
    }
}
/// <summary> Collect the children.
/// <p>An initial test is performed for an empty XML tag, in which case
/// the start tag and end tag of the returned tag are the same and it has
/// no children.<p>
/// If it's not an empty XML tag, the lexer is repeatedly asked for
/// subsequent nodes until an end tag is found or a node is encountered
/// that matches the tag ender set or end tag ender set.
/// In the latter case, a virtual end tag is created.
/// Each node found that is not the end tag is added to
/// the list of children. The end tag is special and not a child.<p>
/// Nodes that also have a CompositeTagScanner as their scanner are
/// recursed into, which provides the nested structure of an HTML page.
/// This method operates in two possible modes, depending on a private boolean.
/// It can recurse on the JVM stack, which has caused some overflow problems
/// in the past, or it can use the supplied stack argument to nest scanning
/// of child tags within itself. The former is left as an option in the code,
/// mostly to help subsequent modifiers visualize what the internal nesting
/// is doing.
/// </summary>
/// <param name="tag">The tag this scanner is responsible for.
/// </param>
/// <param name="lexer">The source of subsequent nodes.
/// </param>
/// <param name="stack">The parse stack. May contain pending tags that enclose
/// this tag.
/// </param>
/// <returns> The resultant tag (may be unchanged).
/// </returns>
public override ITag Scan(ITag tag, Lexer lexer, NodeList stack)
{
    INode node;
    ITag next;
    System.String name;
    IScanner scanner;
    ITag ret;

    ret = tag;

    if (ret.EmptyXmlTag)
    {
        // <tag/> form: the tag is its own end tag and has no children.
        ret.SetEndTag(ret);
    }
    else
        do
        {
            // node == null at the bottom of the loop means "stop scanning".
            node = lexer.NextNode(false);
            if (null != node)
            {
                if (node is ITag)
                {
                    next = (ITag) node;
                    name = next.TagName;
                    // check for normal end tag
                    if (next.IsEndTag() && name.Equals(ret.TagName))
                    {
                        ret.SetEndTag(next);
                        node = null;
                    }
                    else if (IsTagToBeEndedFor(ret, next)) // check DTD
                    {
                        // backup one node; a virtual end tag is inserted later
                        // (by FinishTag) at the current lexer position.
                        lexer.Position = next.StartPosition;
                        node = null;
                    }
                    else if (!next.IsEndTag())
                    {
                        // now recurse if there is a scanner for this type of tag
                        scanner = next.ThisScanner;
                        if (null != scanner)
                        {
                            if (mUseJVMStack)
                            {
                                // JVM stack recursion
                                node = scanner.Scan(next, lexer, stack);
                                AddChild(ret, node);
                            }
                            else
                            {
                                // fake recursion:
                                if (scanner == this)
                                {
                                    if (next.EmptyXmlTag)
                                    {
                                        next.SetEndTag(next);
                                        FinishTag(next, lexer);
                                        AddChild(ret, next);
                                    }
                                    else
                                    {
                                        // push the current tag and continue scanning
                                        // with the nested tag as the new focus.
                                        stack.Add(ret);
                                        ret = next;
                                    }
                                }
                                else
                                {
                                    // normal recursion if switching scanners
                                    node = scanner.Scan(next, lexer, stack);
                                    AddChild(ret, node);
                                }
                            }
                        }
                        else
                            AddChild(ret, next);
                    }
                    else
                    {
                        if (!mUseJVMStack && !mLeaveEnds)
                        {
                            // Since all non-end tags are consumed by the
                            // previous clause, we're here because we have an
                            // end tag with no opening tag... this could be bad.
                            // There are two cases...
                            // 1) The tag hasn't been registered, in which case
                            // we just add it as a simple child, like its
                            // opening tag.
                            // 2) There may be an opening tag further up the
                            // parse stack that needs closing.
                            // So, we ask the factory for a node like this one
                            // (since end tags never have scanners) and see
                            // if its scanner is a composite tag scanner.
                            // If it is we walk up the parse stack looking for
                            // something that needs this end tag to finish it.
                            // If there is something, we close off all the tags
                            // walked over and continue on as if nothing
                            // happened.
                            System.Collections.ArrayList attributes = System.Collections.ArrayList.Synchronized(new System.Collections.ArrayList(10));
                            attributes.Add(new TagAttribute(name, null));
                            ITag opener = lexer.NodeFactory.CreateTagNode(lexer.Page, next.StartPosition, next.EndPosition, attributes);

                            scanner = opener.ThisScanner;
                            if ((null != scanner) && (scanner == this))
                            {
                                // uh-oh: an unmatched end tag for our own kind of tag.
                                int index = - 1;
                                for (int i = stack.Size() - 1; (- 1 == index) && (i >= 0); i--)
                                {
                                    // short circuit here: assume everything on the stack
                                    // has this as its scanner; we'll need to stop if
                                    // either of those conditions isn't met.
                                    ITag boffo = (ITag) stack.ElementAt(i);
                                    if (name.Equals(boffo.TagName))
                                        index = i;
                                    else if (IsTagToBeEndedFor(boffo, next)) // check DTD
                                        index = i;
                                }
                                if (- 1 != index)
                                {
                                    // finish off the current one first
                                    FinishTag(ret, lexer);
                                    AddChild((ITag) stack.ElementAt(stack.Size() - 1), ret);
                                    // close every pending tag above the match, attaching
                                    // each to its parent on the stack.
                                    for (int i = stack.Size() - 1; i > index; i--)
                                    {
                                        ITag fred = (ITag) stack.Remove(i);
                                        FinishTag(fred, lexer);
                                        AddChild((ITag) stack.ElementAt(i - 1), fred);
                                    }
                                    ret = (ITag) stack.Remove(index);
                                    node = null;
                                }
                                else
                                    AddChild(ret, next); // default behaviour
                            }
                            else
                                AddChild(ret, next); // default behaviour
                        }
                        else
                            AddChild(ret, next);
                    }
                }
                else
                {
                    // non-tag node (text, remark, ...): just a child.
                    AddChild(ret, node);
                    node.DoSemanticAction();
                }
            }

            if (!mUseJVMStack)
            {
                // handle coming out of fake recursion
                if (null == node)
                {
                    int depth = stack.Size();
                    if (0 != depth)
                    {
                        node = stack.ElementAt(depth - 1);
                        if (node is ITag)
                        {
                            ITag precursor = (ITag) node;
                            scanner = precursor.ThisScanner;
                            if (scanner == this)
                            {
                                // pop the enclosing tag, attach the finished one
                                // to it and resume scanning the enclosing tag.
                                stack.Remove(depth - 1);
                                FinishTag(ret, lexer);
                                AddChild(precursor, ret);
                                ret = precursor;
                            }
                            else
                                node = null; // normal recursion
                        }
                        else
                            node = null; // normal recursion
                    }
                }
            }
        }
        while (null != node);

    FinishTag(ret, lexer);

    return (ret);
}
/// <summary>
/// Fetch and parse the live ("走地") match list page.
/// </summary>
/// <returns>The parsed matches; empty on download or parse failure.</returns>
public List<OddsLiveMatch> GetScrollMatchList()
{
    List<OddsLiveMatch> liveMatchList = new List<OddsLiveMatch>();
    try
    {
        HttpHelper h = new HttpHelper();
        // NOTE(review): lng=2 appears to select a site language — confirm against the site.
        Cookie lng = new Cookie("lng", "2");
        lng.Domain = domain;
        h.CookieContainer.Add(lng);
        string zoudi = h.GetHtml("https://" + domain + "/default.aspx" + zoudiUrl);
        if (!string.IsNullOrEmpty(zoudi))
        {
            #region Parse the html nodes
            Lexer lexer = new Lexer(zoudi);
            Parser parser = new Parser(lexer);
            NodeList bodyNodes = parser.Parse(new TagNameFilter("HTML"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("BODY"))[0].Children;
            ITag divNode = bodyNodes.ExtractAllNodesThatMatch(new TagNameFilter("FORM"))[0].Children.ExtractAllNodesThatMatch(new TagNameFilter("DIV"))[0] as ITag;
            // Guard against layout changes: the first DIV must be the PageBody container.
            // (The original dereferenced divNode and its ID attribute without null checks.)
            if (divNode != null && divNode.Attributes["ID"] != null && divNode.Attributes["ID"].Equals("PageBody"))
            {
                NodeList dataDivList = divNode.Children.SearchFor(typeof(Winista.Text.HtmlParser.Tags.Div));
                if (dataDivList[0].ToPlainTextString() == "走地盤")
                {
                    // "全場賽果" in the third div means no live matches are listed.
                    if (dataDivList[2].ToPlainTextString() == "全場賽果")
                    {
                        return liveMatchList;
                    }
                    for (int i = 0; i < dataDivList.Count; i++)
                    {
                        ITag div = dataDivList[i] as ITag;
                        if (div != null && div.Attributes["CLASS"] != null && div.Attributes["CLASS"].Equals("menuRow"))
                        {
                            OddsLiveMatch oddsLive = new OddsLiveMatch();
                            oddsLive.urlparams = (div.FirstChild as ITag).Attributes["HREF"].ToString();
                            // Drop the first 4 chars of the first query segment to get the id
                            // — presumably a "?id=" prefix; TODO confirm against a live page.
                            oddsLive.id = oddsLive.urlparams.Split('&')[0].Substring(4);
                            oddsLive.time = DateTime.Now;
                            oddsLive.name = div.ToPlainTextString();
                            liveMatchList.Add(oddsLive);
                        }
                    }
                }
            }
            #endregion
        }
    }
    catch (Exception ex)
    {
        // Best-effort: still return the (possibly empty) list, but no longer
        // swallow failures silently.
        Console.WriteLine("GetScrollMatchList failed: " + ex.Message);
    }
    return liveMatchList;
}
/// <summary>
/// Download a product page from its catalogue link and parse out its title,
/// photos, description and attributes (currently only colours are handled).
/// </summary>
/// <param name="a">Catalogue &lt;a&gt; tag whose Link points at the product page.</param>
public void ParseProduct(ATag a)
{
    string html = GetHtml(a.Link);
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);

    // The whole product block lives under the element with id="productyou".
    NodeFilter productArea = new HasAttributeFilter("id", "productyou");
    NodeList nodes = parser.ExtractAllNodesThatMatch(productArea);
    ParseProductTitle(nodes);
    ParseProductShowPhoto(nodes);
    ParseProductDemoPhoto(nodes);
    ParsePorductDescribe(nodes);

    // Each attribute row has class "cph"; its first child holds the label text.
    // (The unused "chans" lookup and a stray debug Console.WriteLine() were removed.)
    NodeFilter productAttributes = new HasAttributeFilter("class", "cph");
    NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(productAttributes, true);
    int length = productAttributeNodes.Count;
    for (int i = 0; i < length; i++)
    {
        INode n = productAttributeNodes[i].Children[0];
        string t = n.ToPlainTextString();
        // Only the "颜色" (colour) attribute is parsed for now.
        if (Regex.Match(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success)
        {
            ParseProductColors(n);
        }
    }
}