/// <summary>
/// Scans every <paramref name="tag"/> element in <paramref name="html"/> for an attribute whose
/// name contains <paramref name="attribute"/> and whose value equals <paramref name="attValue"/>,
/// and returns the value of <paramref name="attributeV"/> on the first such element.
/// </summary>
/// <param name="html">HTML text to parse.</param>
/// <param name="tag">Tag name used to pre-select candidate elements.</param>
/// <param name="attribute">Text that the matching attribute's name must contain.</param>
/// <param name="attValue">Exact value the matching attribute must have.</param>
/// <param name="attributeV">Name of the attribute whose value is returned.</param>
/// <returns>
/// The value of <paramref name="attributeV"/>, or <c>null</c> when no element matches or the
/// matching element does not carry that attribute.
/// </returns>
public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
{
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new TagNameFilter(tag));

    for (int i = 0; i < nodeList.Count; i++)
    {
        // Guard the "as" cast: a node that is not a tag has no attributes to inspect.
        ITag tagNode = nodeList[i] as ITag;
        if (tagNode == null || tagNode.Attributes == null || tagNode.Attributes.Count == 0)
        {
            continue;
        }

        foreach (string key in tagNode.Attributes.Keys)
        {
            // Presumably the parser's synthetic tag-name entry; it is never a real attribute (TODO confirm).
            if (key.Contains("<TAGNAME>"))
            {
                continue;
            }

            if (!key.Contains(attribute))
            {
                continue;
            }

            object candidate = tagNode.Attributes[key];
            if (candidate == null || candidate.ToString() != attValue)
            {
                continue;
            }

            // A missing target attribute yields null instead of a NullReferenceException.
            object wanted = tagNode.Attributes[attributeV];
            return wanted == null ? null : wanted.ToString();
        }
    }

    return null;
}
/// <summary>
/// Extracts one <see cref="Product"/> per element with <c>class="product"</c> found in
/// <paramref name="html"/>.
/// </summary>
/// <remarks>
/// Extraction is best-effort: an element that lacks a title, a parsable price or an
/// <c>img</c> tag is skipped rather than aborting the whole page.
/// </remarks>
/// <param name="html">HTML text of the product listing.</param>
/// <returns>The products that could be read; never <c>null</c>.</returns>
public static List<Product> LoadGoods(string html)
{
    // The price text is read starting at this offset, as in the original Substring(7).
    // NOTE(review): presumably skips a fixed label/currency prefix - confirm against the page markup.
    const int pricePrefixLength = 7;

    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeList products = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "product"));
    List<Product> result = new List<Product>();

    for (int i = 0; i < products.Count; i++)
    {
        try
        {
            ITag product = products[i] as ITag;
            if (product == null || product.Children == null)
            {
                continue;
            }

            // name
            ITag name = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "product-title"));
            if (name == null)
            {
                continue;
            }

            // price
            ITag price = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "product-price"));
            if (price == null)
            {
                continue;
            }

            string priceText = price.ToPlainTextString().Trim();
            decimal pprice;
            // Invariant culture: scraped prices must not depend on the machine's regional settings.
            if (priceText.Length < pricePrefixLength ||
                !decimal.TryParse(
                    priceText.Substring(pricePrefixLength),
                    System.Globalization.NumberStyles.Number,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out pprice))
            {
                continue;
            }

            // img
            ITag img = FirstMatchingTag(product.Children, new TagNameFilter("img"));
            if (img == null)
            {
                continue;
            }

            // promo (optional)
            string ppromo = "";
            ITag promo = FirstMatchingTag(product.Children, new HasAttributeFilter("class", "promo"));
            if (promo != null)
            {
                ppromo = promo.GetAttribute("data-promo");
            }

            Product p = new Product();
            p.img = img.GetAttribute("DATA-KS-LAZYLOAD");
            p.name = name.ToPlainTextString().Trim();
            p.price = pprice;
            p.promo = ppromo;
            result.Add(p);
        }
        catch (Exception ex)
        {
            // Still best-effort: one malformed product must not abort the page,
            // but leave a trace instead of swallowing the failure silently.
            System.Diagnostics.Debug.WriteLine("LoadGoods: skipped product #" + i + ": " + ex.Message);
        }
    }

    return result;
}

/// <summary>
/// Returns the first node under <paramref name="scope"/> (searched recursively) that matches
/// <paramref name="filter"/> and is a tag, or <c>null</c> when there is none.
/// </summary>
private static ITag FirstMatchingTag(NodeList scope, NodeFilter filter)
{
    NodeList matches = scope.ExtractAllNodesThatMatch(filter, true);
    if (matches == null || matches.Count == 0)
    {
        return null;
    }

    return matches[0] as ITag;
}
/// <summary>
/// Downloads the job detail page at <paramref name="url"/> and builds a <see cref="Job"/> from the
/// element with <c>class="d_left"</c>.
/// </summary>
/// <remarks>
/// The title is left empty here. When the page has no <c>d_left</c> element, a freshly
/// constructed <see cref="Job"/> with no fields assigned is returned.
/// </remarks>
/// <param name="url">Absolute URL of the job detail page.</param>
/// <returns>The populated job, or an untouched <see cref="Job"/> when no detail element was found.</returns>
public Job GetJobInfoParser(string url)
{
    Job jobinfo = new Job();

    Parser parser = new Parser(new HttpProtocol(new Uri(url)));
    NodeFilter detail = new HasAttributeFilter("class", "d_left");
    NodeList nodeDetail = parser.ExtractAllNodesThatMatch(detail);
    if (nodeDetail == null || nodeDetail.Count == 0)
    {
        return jobinfo;
    }

    string description = GetDetailString(nodeDetail);

    // Publication time: fall back to "now" when the page carries no parsable timestamp.
    // TryParse resets its out argument to DateTime.MinValue on failure, so parse into a
    // scratch variable and only copy it over on success.
    DateTime createdOn = DateTime.Now;
    Match m = Regex.Match(description, @"发布时间:(?<date>\d\d\d\d-\d{1,2}\-\d{1,2} \d{1,2}\:\d{1,2})");
    DateTime published;
    if (m.Success && m.Groups["date"].Success && DateTime.TryParse(m.Groups["date"].Value, out published))
    {
        createdOn = published;
    }

    string email = string.Empty;
    Match emailMatch = Regex.Match(description, @"([a-zA-Z0-9_\-\.]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?)");
    if (emailMatch.Success)
    {
        email = emailMatch.Value;
    }

    // NOTE(review): inside a character class '|' is a literal, so "[3|5|8]" also accepts '|';
    // the pattern is left unchanged here.
    Match telMatch = Regex.Match(description, @"(1[3|5|8][0-9]|15[0|3|6|7|8|9]|18[8|9])\d{8}");
    if (telMatch.Success)
    {
        jobinfo.tel = telMatch.Value;
    }

    jobinfo.category_id = Catalog.id;
    jobinfo.title = string.Empty;
    jobinfo.description = description;
    jobinfo.created_on = createdOn;
    jobinfo.is_active = true;
    jobinfo.city_id = Catalog.city.id;
    jobinfo.sp1010url = url;
    jobinfo.poster_email = email;
    return jobinfo;
}
/// <summary>
/// Reads the HTML file at <paramref name="xmlFile"/> and builds one <see cref="PlayTime"/> per
/// element with <c>CLASS="px14"</c>.
/// </summary>
/// <remarks>
/// The cinema ID is the first run of four digits in <paramref name="xmlFile"/>. The movie ID is
/// the first six-digit (otherwise five-digit) run in the <c>HREF</c> of the element's first
/// child; when it cannot be read the movie ID keeps its default value.
/// </remarks>
/// <param name="xmlFile">Path of the saved page; also the source of the cinema ID.</param>
/// <returns>The extracted play times, or <c>null</c> when the page has no matching element.</returns>
public List<PlayTime> getPlayTimes(string xmlFile)
{
    string cinemaID = Regex.Match(xmlFile, @"\d\d\d\d").Value; // cinema ID
    string html = File.ReadAllText(xmlFile);

    Parser playParser = new Parser(new Lexer(html));
    NodeList playTimeList = playParser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "px14"));
    if (playTimeList.Count < 1)
    {
        return null;
    }

    // Loop-invariant: parse the cinema ID once instead of once per entry.
    int cinemaNumber = int.Parse(cinemaID);
    char[] releaseSuffix = { '上', '映' };
    List<PlayTime> playTimes = new List<PlayTime>();

    for (int i = 0; i < playTimeList.Count; i++)
    {
        PlayTime playTime = new PlayTime();
        ITag playTag = playTimeList[i] as ITag;

        // The first child may be a text node, or the link may have no HREF;
        // in both cases the movie ID is simply left unset.
        ITag idTag = playTag.FirstChild as ITag;
        object href = (idTag == null || idTag.Attributes == null) ? null : idTag.Attributes["HREF"];
        if (href != null)
        {
            string strID = href.ToString();
            Match idMatch = Regex.Match(strID, @"\d\d\d\d\d\d");
            if (!idMatch.Success)
            {
                idMatch = Regex.Match(strID, @"\d\d\d\d\d");
            }

            if (idMatch.Success)
            {
                playTime.MovieID = int.Parse(idMatch.Value);
            }
        }

        // The release date is the text of the second sibling after the element,
        // with any leading/trailing '上' and '映' characters removed.
        string strTime = playTag.NextSibling.NextSibling.ToPlainTextString();
        strTime = strTime.Trim(releaseSuffix);

        playTime.Playtime = DateTime.Parse(strTime);
        playTime.CinemaID = cinemaNumber;
        playTime.PlayState = true;
        playTimes.Add(playTime);
    }

    return playTimes;
}
/// <summary>
/// Extracts up to five facility descriptions from a cinema page.
/// </summary>
/// <remarks>
/// For every element with <c>CLASS="c_000"</c>, the <c>CLASS</c> of its previous sibling selects
/// which output receives the plain text of the element's parent. Outputs with no matching
/// element stay empty.
/// </remarks>
/// <param name="html">HTML text of the cinema page.</param>
/// <param name="dining">Parent text where the sibling class is <c>ico_cside1 mr12</c>.</param>
/// <param name="park">Parent text where the sibling class is <c>ico_cside2 mr12</c>.</param>
/// <param name="gameCenter">Parent text where the sibling class is <c>ico_cside3 mr12</c>.</param>
/// <param name="intro3D">Parent text where the sibling class is <c>ico_cside5 mr12</c>.</param>
/// <param name="introVIP">Parent text where the sibling class is <c>ico_cside7 mr12</c>.</param>
/// <returns>Always <c>true</c>.</returns>
public bool getCinemaFive(string html, out string dining, out string park, out string gameCenter, out string intro3D, out string introVIP)
{
    dining = string.Empty;
    park = string.Empty;
    gameCenter = string.Empty;
    intro3D = string.Empty;
    introVIP = string.Empty;

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "c_000"));

    for (int i = 0; i < nodeList.Count; i++)
    {
        INode node = nodeList[i];

        // The previous sibling can be missing or a text node, and the node can be a root;
        // such entries carry no icon class and are skipped instead of throwing.
        ITag iconTag = node.PreviousSibling as ITag;
        INode container = node.Parent;
        if (iconTag == null || iconTag.Attributes == null || container == null)
        {
            continue;
        }

        object iconClass = iconTag.Attributes["CLASS"];
        if (iconClass == null)
        {
            continue;
        }

        switch (iconClass.ToString())
        {
            case "ico_cside1 mr12":
                dining = container.ToPlainTextString();
                break;
            case "ico_cside2 mr12":
                park = container.ToPlainTextString();
                break;
            case "ico_cside3 mr12":
                gameCenter = container.ToPlainTextString();
                break;
            case "ico_cside5 mr12":
                intro3D = container.ToPlainTextString();
                break;
            case "ico_cside7 mr12":
                introVIP = container.ToPlainTextString();
                break;
        }
    }

    return true;
}
/// <summary>
/// Returns the value of <paramref name="attributeV"/> on the single element of
/// <paramref name="html"/> whose <paramref name="attribute"/> equals <paramref name="attValue"/>.
/// </summary>
/// <param name="html">HTML text to parse.</param>
/// <param name="tag">Not used by this implementation; kept for signature compatibility.</param>
/// <param name="attribute">Attribute name to filter on.</param>
/// <param name="attValue">Attribute value to filter on.</param>
/// <param name="attributeV">Name of the attribute whose value is returned.</param>
/// <returns>
/// The attribute value, or <c>null</c> when the filter does not match exactly one element or the
/// element does not carry <paramref name="attributeV"/>.
/// </returns>
public static string getAttValue(string html, string tag, string attribute, string attValue, string attributeV)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new HasAttributeFilter(attribute, attValue));

    // Only an unambiguous (single) match is accepted.
    if (nodeList.Count != 1)
    {
        return null;
    }

    ITag tagNode = nodeList[0] as ITag;
    if (tagNode == null || tagNode.Attributes == null)
    {
        return null;
    }

    // A missing attribute yields null instead of a NullReferenceException.
    object wanted = tagNode.Attributes[attributeV];
    return wanted == null ? null : wanted.ToString();
}
/// <summary>
/// Collects the plain text of every element in <paramref name="html"/> whose
/// <paramref name="attribute"/> equals <paramref name="attValue"/>.
/// </summary>
/// <param name="html">HTML text to parse.</param>
/// <param name="tag">Not used by this implementation.</param>
/// <param name="attribute">Attribute name to filter on.</param>
/// <param name="attValue">Attribute value to filter on.</param>
/// <returns>The plain text of each matching element, in document order; empty when none match.</returns>
public static List<string> getValues(string html, string tag, string attribute, string attValue)
{
    Parser parser = new Parser(new Lexer(html));
    NodeList matches = parser.ExtractAllNodesThatMatch(new HasAttributeFilter(attribute, attValue));

    List<string> texts = new List<string>();
    for (int index = 0; index < matches.Count; index++)
    {
        ITag element = matches[index] as ITag;

        // Elements without any attribute entries contribute nothing.
        if (element.Attributes == null || element.Attributes.Count <= 0)
        {
            continue;
        }

        texts.Add(element.ToPlainTextString());
    }

    return texts;
}
/// <summary>
/// Downloads the card list page at <c>Config.Uri</c>, rebuilds <c>m_Cards</c> from its table rows,
/// serializes the list to <c>Config.CardsXml</c> and downloads one image per card.
/// </summary>
/// <remarks>
/// Rows are <c>TableRow</c> nodes whose <c>class</c> is <c>even</c> or <c>odd</c>.
/// NOTE(review): the XML file is opened with <c>FileMode.CreateNew</c>, which throws when the file
/// already exists - confirm that re-running against an existing file is not expected.
/// </remarks>
/// <param name="notifier">Optional progress sink; may be <c>null</c>.</param>
public void GetFromWeb(IGetFromWebNotify notifier)
{
    Directory.CreateDirectory(Config.ImagePath);
    if (notifier != null)
    {
        notifier.Notity(String.Format("���� {0}", Config.Uri), 0.0f);
    }

    // WebClient is IDisposable; scope it so it is released even when a download fails.
    using (WebClient webClient = new WebClient())
    {
        webClient.Encoding = Encoding.UTF8;
        String strHtml = webClient.DownloadString(Config.Uri);
        if (notifier != null)
        {
            notifier.Notity("����html�ĵ�", 0.0f);
        }

        Lexer lexer = new Lexer(strHtml);
        Parser parser = new Parser(lexer);
        AndFilter andFilter = new AndFilter(
            new NodeClassFilter(typeof(TableRow)),
            new OrFilter(new HasAttributeFilter("class", "even"), new HasAttributeFilter("class", "odd")));
        NodeList htmlNodes = parser.ExtractAllNodesThatMatch(andFilter);

        lock (this)
        {
            m_Cards = new List<Card>();
            foreach (INode node in htmlNodes.ToNodeArray())
            {
                m_Cards.Add(ReadCardRow(node));
            }
        }

        XmlSerializer s = new XmlSerializer(typeof(List<Card>));
        // Close the stream even when serialization throws.
        using (FileStream fstream = new FileStream(Config.CardsXml, FileMode.CreateNew))
        {
            s.Serialize(fstream, m_Cards);
        }

        foreach (Card card in m_Cards)
        {
            if (notifier != null)
            {
                notifier.Notity(String.Format("��ȡ��Ƭ\"{0}\"��Ϣ", card.Name), 1.0f / m_Cards.Count);
            }

            webClient.DownloadFile(Path.Combine(Config.BaseImageUri, card.ID.ToString() + ".jpg"), card.ImagePath);
        }
    }
}

/// <summary>
/// Builds a <see cref="Card"/> from one table row. Text nodes are ignored; the remaining
/// children are read positionally: ID, name, long info, mana cost, rarity.
/// </summary>
private Card ReadCardRow(INode row)
{
    Card card = new Card();
    int iFieldIndex = 0;

    foreach (INode subNode in row.Children.ToNodeArray())
    {
        if (subNode is TextNode)
        {
            continue;
        }

        switch (iFieldIndex)
        {
            case 0:
                card.ID = Convert.ToInt32(subNode.FirstChild.GetText());
                card.ImagePath = Path.Combine(Config.ImagePath, card.ID.ToString() + ".jpg");
                break;
            case 1:
                card.Name = subNode.FirstChild.FirstChild.GetText();
                break;
            case 2:
                StringHelper.FillCardLongInfo(subNode.FirstChild.GetText(), card);
                break;
            case 3:
                // The mana cost cell may be empty.
                card.ManaCost = subNode.FirstChild != null ? subNode.FirstChild.GetText() : String.Empty;
                break;
            case 4:
                card.Rare = subNode.FirstChild.GetText();
                break;
        }

        iFieldIndex++;
    }

    return card;
}
/// <summary>
/// Downloads the page for <paramref name="date"/> (base URL + date + "/"), finds its image nodes
/// and hands each one to <c>GetPicUrlsFromBeautyPersonalPage</c>.
/// </summary>
/// <remarks>
/// <c>OneDayOneBeautyImgFilter</c> is tried first; when it matches nothing,
/// <c>OneDayOneBeautyImgFilter2</c> is used instead.
/// </remarks>
/// <param name="date">Date segment appended to the base URL.</param>
/// <returns>The number of image nodes processed; 0 when the page is empty or the request failed.</returns>
static int OneDayOneBeauty(string date)
{
    try
    {
        string htmlContent = "";
        string url = oneDayOneBeautyBaseUrl + date + "/";
        HttpWebRequest httpWebRequest = HttpWebRequest.CreateHttp(url);
        httpWebRequest.Method = "GET";

        // Dispose the response and reader on every path, including non-OK status codes and read failures.
        using (HttpWebResponse httpWebResponse = (HttpWebResponse)httpWebRequest.GetResponse())
        {
            if (httpWebResponse.StatusCode.Equals(HttpStatusCode.OK))
            {
                using (StreamReader reader = new StreamReader(new BufferedStream(httpWebResponse.GetResponseStream(), 4 * 200 * 1024)))
                {
                    htmlContent = reader.ReadToEnd();
                }
            }
        }

        if (htmlContent.Equals(""))
        {
            Console.WriteLine("得到的HTML为空!");
            return 0;
        }

        Lexer lexer = new Lexer(htmlContent);
        Parser parser = new Parser(lexer);
        parser.AnalyzePage();
        NodeList divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter);
        if (divList.Count == 0)
        {
            parser.AnalyzePage();
            divList = parser.ExtractAllNodesThatMatch(OneDayOneBeautyImgFilter2);
        }

        for (int i = 0; i < divList.Count; i++)
        {
            // Adjusted on 2014-05-16 to follow a change in the page structure.
            ImageTag imgNode = (ImageTag)divList[i];
            GetPicUrlsFromBeautyPersonalPage(imgNode, i, 1);
        }

        return divList.Count;
    }
    catch (WebException e)
    {
        // e.Response is null when the failure is not an HTTP protocol error
        // (DNS failure, timeout, connection refused), so it must be checked before use.
        using (HttpWebResponse errorResponse = e.Response as HttpWebResponse)
        {
            if (errorResponse == null)
            {
                Console.WriteLine("访问网页出错!" + e.Message);
            }
            else if (errorResponse.StatusCode.Equals(HttpStatusCode.NotFound))
            {
                Console.WriteLine("网页未找到!");
            }
            else
            {
                Console.WriteLine("访问网页出错!状态码:" + errorResponse.StatusCode);
            }
        }

        return 0;
    }
}
/// <summary>
/// Downloads the flow page for <paramref name="id"/>, follows its first "/girl/…/" link to the
/// personal page and hands every image node found there to <c>GetPicUrlsFromBeautyPersonalPage</c>.
/// </summary>
/// <remarks>
/// The personal page is only processed when <c>BeautyNameFilter</c> matches exactly one node;
/// otherwise an error is printed and the method waits for a key press before returning.
/// The previous version also cut a name out of that node but never used it; that dead
/// computation (which could throw and silently abort the download) has been removed.
/// Failures are reported on the console instead of being swallowed; HTTP 404 responses stay silent.
/// </remarks>
/// <param name="id">Numeric ID appended to <c>BeautyFlowBaseUrl</c>.</param>
static void BeautyFlow(int id)
{
    try
    {
        string htmlContent = DownloadPageHtml(BeautyFlowBaseUrl + id + "/");
        if (htmlContent.Equals(""))
        {
            Console.WriteLine("得到的HTML为空!");
            return;
        }

        Console.WriteLine("第一个html读取完成!");

        // Locate "/girl/<name>/" explicitly instead of relying on Substring throwing.
        int startIndex = htmlContent.IndexOf("/girl/");
        int slashIndex = startIndex < 0 ? -1 : htmlContent.IndexOf("/", startIndex + 6);
        if (slashIndex < 0)
        {
            Console.WriteLine("未找到个人页面链接! id=" + id);
            return;
        }

        int endIndex = slashIndex + 1;
        string beautyMorePicturesLink = "http://curator.im" + htmlContent.Substring(startIndex, endIndex - startIndex);

        string htmlContentTwo = DownloadPageHtml(beautyMorePicturesLink);
        Console.WriteLine("第二个html读取完成!");

        Lexer lexer = new Lexer(htmlContentTwo);
        Parser parser = new Parser(lexer);
        parser.AnalyzePage();
        NodeList divList = parser.ExtractAllNodesThatMatch(BeautyNameFilter);
        if (divList.Count != 1)
        {
            Console.WriteLine("获取正妹名称出错! id=" + id);
            Console.Read();
            return;
        }

        parser.AnalyzePage();
        divList = parser.ExtractAllNodesThatMatch(BeautyFlowImgFilter);
        for (int i = 0; i < divList.Count; i++)
        {
            ImageTag imgNode = (ImageTag)divList[i];
            GetPicUrlsFromBeautyPersonalPage(imgNode, i, 2);
        }
    }
    catch (WebException ex)
    {
        // ex.Response is null for non-protocol failures (DNS, timeout, ...).
        using (HttpWebResponse errorResponse = ex.Response as HttpWebResponse)
        {
            if (errorResponse == null)
            {
                Console.WriteLine("访问网页出错!" + ex.Message);
            }
            else if (!errorResponse.StatusCode.Equals(HttpStatusCode.NotFound))
            {
                Console.WriteLine("访问网页出错!状态码:" + errorResponse.StatusCode);
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the best-effort contract (never throw to the caller) but make the failure visible.
        Console.WriteLine("处理页面出错! id=" + id + " " + ex.Message);
    }
}

/// <summary>
/// Performs a GET request and returns the response body, or an empty string when the status
/// code is not 200 OK. The response and reader are always disposed.
/// </summary>
private static string DownloadPageHtml(string url)
{
    HttpWebRequest request = HttpWebRequest.CreateHttp(url);
    request.Method = "GET";
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        if (!response.StatusCode.Equals(HttpStatusCode.OK))
        {
            return "";
        }

        using (StreamReader reader = new StreamReader(new BufferedStream(response.GetResponseStream(), 4 * 200 * 1024)))
        {
            return reader.ReadToEnd();
        }
    }
}
/// <summary>
/// Crawls list page number <paramref name="idx"/> ("index{idx}.html" under <c>Catalog.sp1010</c>),
/// and for every link with <c>class="Linklist"</c> that is not already stored, fetches the job
/// details, prints them and inserts the job.
/// </summary>
/// <remarks>
/// Extraction of the link list is attempted up to five times. A failure to open the page aborts
/// immediately; a failure while extracting is logged and retried.
/// </remarks>
/// <param name="idx">Index of the list page to crawl.</param>
public void SpiderCurrentPage(int idx)
{
    const int maxAttempts = 5;

    ParserConf.GetConfiguration().RootPath = AppDomain.CurrentDomain.BaseDirectory;
    string url = Catalog.sp1010 + string.Format("index{0}.html", idx);

    NodeList nodeList = null;
    for (int attempt = 0; attempt < maxAttempts && nodeList == null; attempt++)
    {
        SpiderEventLog.WriteSourceLog("Spider " + url, url, EventLogEntryType.Information);

        Parser parser;
        try
        {
            parser = new Parser(new HttpProtocol(new Uri(url)));
        }
        catch (Exception ex)
        {
            SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
            return;
        }

        try
        {
            NodeFilter filter = new HasAttributeFilter("class", "Linklist");
            nodeList = parser.ExtractAllNodesThatMatch(filter);
        }
        catch (Exception ex)
        {
            // Logged and retried on the next iteration.
            SpiderEventLog.WriteWarningLog("获取列表页面数据错误:" + url + "\r\n" + ex.ToString());
        }
    }

    if (nodeList == null)
    {
        return;
    }

    int length = nodeList.Count;
    for (int i = 0; i < length; i++)
    {
        // The class filter can match elements other than <a>; those carry no link to follow.
        ATag node = nodeList[i] as ATag;
        if (node == null)
        {
            continue;
        }

        if (IsExistJob(node.Link))
        {
            SpiderEventLog.WriteLog(string.Format("职务 [{0}] 已存在", node.LinkText));
            continue;
        }

        Job jobinfo = GetJobInfoParser(node.Link);
        // Strip HTML entities such as "&nbsp;" from the link text.
        jobinfo.title = Regex.Replace(node.LinkText, "&[^&;]{0,};", "", RegexOptions.IgnoreCase);

        ConsoleColor color = Console.ForegroundColor;
        Console.ForegroundColor = ConsoleColor.Red;
        try
        {
            Console.WriteLine("=".PadLeft(120, '='));
            Console.WriteLine("title:{0}", jobinfo.title);
            Console.WriteLine("url:{0}", jobinfo.sp1010url);
            Console.WriteLine("tel:{0}", jobinfo.tel);
            Console.WriteLine("email:{0}", jobinfo.poster_email);
            Console.WriteLine("desc:{0}", jobinfo.description);
            Console.WriteLine("=".PadLeft(120, '='));
        }
        finally
        {
            // Always restore the caller's console colour.
            Console.ForegroundColor = color;
        }

        InsertJobInfo(jobinfo);
    }
}
/// <summary>
/// Reads the cinema's grade from the single element with <c>CLASS="point ml12 px18"</c> by
/// concatenating the plain text of its first and last child and parsing the result as a number.
/// </summary>
/// <param name="html">HTML text of the cinema page.</param>
/// <returns>
/// The parsed grade, or the default of 7.0 when the element is not found exactly once, has no
/// children, or its text is not a valid number.
/// </returns>
public float getCinemaGrade(string html)
{
    const float defaultGrade = 7.0f;

    Parser parser = new Parser(new Lexer(html));
    NodeList nodeList = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "point ml12 px18"));
    if (nodeList.Count != 1)
    {
        return defaultGrade;
    }

    // Work on INode directly: casting the children to ITag yields null for text nodes.
    INode node = nodeList[0];
    INode leftPart = node.FirstChild;
    INode rightPart = node.LastChild;
    if (leftPart == null || rightPart == null)
    {
        return defaultGrade;
    }

    string strGrade = leftPart.ToPlainTextString() + rightPart.ToPlainTextString();

    // Invariant culture so "8.5" parses the same regardless of regional settings;
    // the styles are the ones float.Parse(string) uses by default.
    float grade;
    bool parsed = float.TryParse(
        strGrade,
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        System.Globalization.CultureInfo.InvariantCulture,
        out grade);
    return parsed ? grade : defaultGrade;
}
/// <summary>
/// Splits each "*.htm" file in the hard-coded subtitle directory into four commentary tracks.
/// </summary>
/// <remarks>
/// Each file has its own set of four CSS-class filters, selected by the file's position in the
/// directory listing. For every file the four node lists are passed to
/// <c>GetSubtitleFromHtml</c> together with the file name (up to its first '.') plus a track suffix.
/// NOTE(review): the filter sets depend on the order returned by <c>GetFiles</c>, which the
/// framework does not guarantee - confirm the files enumerate in the intended order.
/// </remarks>
static void GetSubtitleHtmlFromFile()
{
    // One row per file; columns are the cast, director/writer, effects crew and production team tracks.
    NodeFilter[][] filters = new NodeFilter[][]
    {
        SubtitleClassFilters("xl29", "xl31", "xl32", "xl33"),
        SubtitleClassFilters("xl25", "xl26", "xl27", "xl28"),
        SubtitleClassFilters("xl27", "xl28", "xl29", "xl30"),
        new NodeFilter[]
        {
            new HasAttributeFilter("class", "xl27"),
            new HasAttributeFilter("class", "xl28"),
            new HasAttributeFilter("class", "xl29"),
            new OrFilter(new HasAttributeFilter("class", "xl31"), new HasAttributeFilter("class", "xl30"))
        },
        SubtitleClassFilters("xl27", "xl28", "xl29", "xl30"),
        SubtitleClassFilters("xl33", "xl32", "xl30", "xl28"),
        SubtitleClassFilters("xl29", "xl30", "xl31", "xl32"),
        SubtitleClassFilters("xl28", "xl24", "xl30", "xl31")
    };

    // Output name suffixes, in the same order as the filter columns:
    // red = cast, yellow = director/writer, blue = effects crew, green = production team.
    string[] trackSuffixes = { "_演员解说", "_导演编剧解说", "_特技制作组解说", "_幕后制作团队解说" };

    DirectoryInfo directory = new DirectoryInfo(@"D:\Download\魔戒三部曲电影导演评论字幕\mht");
    int count = 0;
    foreach (FileInfo file in directory.GetFiles("*.htm"))
    {
        // There is no filter set beyond the last row; stop instead of indexing out of range.
        if (count >= filters.Length)
        {
            break;
        }

        string htmlContent;
        using (StreamReader reader = new StreamReader(file.FullName))
        {
            htmlContent = reader.ReadToEnd();
        }

        string fileName = file.Name.Substring(0, file.Name.IndexOf('.'));
        Parser parser = new Parser(new Lexer(htmlContent));

        for (int track = 0; track < trackSuffixes.Length; track++)
        {
            // The parser must be reset before it is reused for another extraction.
            if (track > 0)
            {
                parser.Reset();
            }

            NodeList trackNodes = parser.ExtractAllNodesThatMatch(filters[count][track]);
            GetSubtitleFromHtml(trackNodes, fileName + trackSuffixes[track]);
        }

        count++;
    }
}

/// <summary>
/// Builds one <c>class</c>-equals filter per given CSS class name, in order.
/// </summary>
private static NodeFilter[] SubtitleClassFilters(params string[] cssClasses)
{
    NodeFilter[] row = new NodeFilter[cssClasses.Length];
    for (int i = 0; i < cssClasses.Length; i++)
    {
        row[i] = new HasAttributeFilter("class", cssClasses[i]);
    }

    return row;
}
/// <summary>
/// Turns the detail nodes into plain text: renders them to HTML, runs <c>ReplaceNode</c> for
/// script and anchor tags, applies a series of textual substitutions, re-parses the result and
/// returns the plain text of the first <c>Div</c>.
/// </summary>
/// <param name="nodeDetail">Nodes holding the job detail markup.</param>
/// <returns>The plain text of the first <c>Div</c> in the cleaned markup, or an empty string when there is none.</returns>
private string GetDetailString(NodeList nodeDetail)
{
    string markup = nodeDetail.ToHtml();

    // NOTE(review): ReplaceNode presumably strips nodes of the given type from the markup - confirm.
    ReplaceNode(ref markup, nodeDetail, typeof(ScriptTag));
    ReplaceNode(ref markup, nodeDetail, typeof(ATag));

    // Line breaks become CRLF; decorations, branding and labels are rewritten or dropped.
    markup = markup.Replace("<br />", "\r\n");
    markup = markup.Replace(" ", "");
    markup = markup.Replace("»", "");
    markup = markup.Replace("1010兼职网", "135free.com");
    markup = markup.Replace("所在地:", "");
    markup = Regex.Replace(markup, @"信息编号:\d{0,}", "");

    Parser cleanedParser = new Parser(new Lexer(markup));
    NodeList divs = cleanedParser.ExtractAllNodesThatMatch(new NodeClassFilter(typeof(Div)));

    return divs.Count > 0 ? divs[0].ToPlainTextString() : "";
}
/// <summary>
/// Looks up the ID of the paper called <paramref name="paper_name"/>.
/// </summary>
/// <remarks>
/// The search page is obtained from <c>_HttpUtil.getPaperIDHTML</c>. The first <c>A</c> tag with
/// <c>target="_blank"</c> that has a child accepted by <c>PaperFilter</c> supplies the ID: the last
/// '/'-separated segment of its <c>href</c>, cut at the first '.'.
/// </remarks>
/// <param name="paper_name">Title of the paper to look up.</param>
/// <returns>The paper ID, or <c>null</c> when the page is empty, no link matches, or the link has no <c>href</c>.</returns>
protected string getPaperID(string paper_name)
{
    string html_page = _HttpUtil.getPaperIDHTML(paper_name);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }

    // <a target="_blank"> elements that contain the paper title.
    AndFilter blankTargetLink = new AndFilter(new TagNameFilter("A"), new HasAttributeFilter("target", "_blank"));
    AndFilter paperLink = new AndFilter(blankTargetLink, new HasChildFilter(new PaperFilter(paper_name)));

    Parser p = new Parser(new Lexer(html_page));
    NodeList candidates = p.ExtractAllNodesThatMatch(paperLink);
    if (candidates == null || candidates.Count <= 0)
    {
        // Paper not found.
        return null;
    }

    // TODO: several papers may match; only the first one is used.
    ITag link = candidates[0] as ITag;
    if (link == null)
    {
        return null;
    }

    string href = link.GetAttribute("href");
    if (string.IsNullOrEmpty(href))
    {
        return null;
    }

    // Last path segment, without anything from its first '.' onwards.
    string lastSegment = href.Substring(href.LastIndexOf('/') + 1);
    int dot = lastSegment.IndexOf('.');
    return dot < 0 ? lastSegment : lastSegment.Substring(0, dot);
}
/// <summary>
/// Downloads the product page behind <paramref name="a"/> and parses the element with
/// <c>id="productyou"</c>: title, show photos, demo photos, description and colours.
/// </summary>
/// <remarks>
/// Colours are parsed from the first child of every <c>class="cph"</c> element whose text starts
/// (after optional whitespace) with "颜色". One blank console line is written per <c>cph</c> element.
/// </remarks>
/// <param name="a">Link to the product page.</param>
public void ParseProduct(ATag a)
{
    string html = GetHtml(a.Link);
    Parser parser = new Parser(new Lexer(html));
    NodeList nodes = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("id", "productyou"));

    ParseProductTitle(nodes);
    ParseProductShowPhoto(nodes);
    ParseProductDemoPhoto(nodes);
    ParsePorductDescribe(nodes);

    NodeList productAttributeNodes = nodes.ExtractAllNodesThatMatch(new HasAttributeFilter("class", "cph"), true);
    int length = productAttributeNodes.Count;
    for (int i = 0; i < length; i++)
    {
        // An attribute element may have no children at all; there is nothing to inspect then.
        NodeList children = productAttributeNodes[i].Children;
        if (children != null && children.Count > 0)
        {
            INode n = children[0];
            string t = n.ToPlainTextString();
            if (Regex.Match(t, @"^\s{0,}颜色", RegexOptions.IgnoreCase).Success)
            {
                ParseProductColors(n);
            }
        }

        Console.WriteLine();
    }
}
/// <summary>
/// Helper: extracts the user's details from HTML and stores them on <c>user</c>.
/// </summary>
/// <remarks>
/// Each detail is located by CSS class. When a detail cannot be identified unambiguously a
/// diagnostic line is written to the console and the corresponding field is left untouched.
/// </remarks>
/// <param name="currentUserHtml">HTML text containing the Weibo user's information.</param>
private void GetUserInfoFromHtml(string currentUserHtml)
{
    // Filters for each piece of information.
    HasAttributeFilter nickNameFilter = new HasAttributeFilter("class", "name");
    HasAttributeFilter remarkNameFilter = new HasAttributeFilter("class", "CH");
    HasAttributeFilter linkUrlFilter = new HasAttributeFilter("class", "pf_lin S_link1");
    HasAttributeFilter selfIntroFilter = new HasAttributeFilter("class", "pf_intro bsp");
    HasAttributeFilter tagsFilter = new HasAttributeFilter("class", "S_func1");
    HasAttributeFilter profileFilter = new HasAttributeFilter("class", "tags");

    Lexer lexer = new Lexer(currentUserHtml);
    Parser parser = new Parser(lexer);

    // Nickname. The type is checked so an unexpected element reports an error instead of failing the cast.
    NodeList nickNameNodeList = parser.ExtractAllNodesThatMatch(nickNameFilter);
    if (nickNameNodeList.Size() == 1 && nickNameNodeList[0] is Span)
    {
        user.NickName = ((Span)nickNameNodeList[0]).ToPlainTextString();
    }
    else
    {
        Console.WriteLine("判断微博名的标准出错!");
    }

    // Important: to reuse the parser it must be reset after one extraction has finished and
    // before the next one starts, otherwise the next extraction fails.
    parser.Reset();

    // Remark name.
    NodeList remarkNameNodeList = parser.ExtractAllNodesThatMatch(remarkNameFilter);
    string remark = null;
    if (remarkNameNodeList.Size() == 1 && remarkNameNodeList[0].GetType().Equals(typeof(Span)))
    {
        remark = ((Span)remarkNameNodeList[0]).ToPlainTextString();
    }

    if (remark != null && remark.Length >= 2)
    {
        // Drop the enclosing brackets (first and last character).
        user.RemarkName = remark.Substring(1, remark.Length - 2);
    }
    else
    {
        Console.WriteLine("判断微博备注名称的标准出错!");
    }

    parser.Reset();

    // Profile link URL.
    NodeList linkUrlNodeList = parser.ExtractAllNodesThatMatch(linkUrlFilter);
    if (linkUrlNodeList.Size() == 1 && linkUrlNodeList[0].GetType().Equals(typeof(ATag)))
    {
        user.LinkURL = ((ATag)linkUrlNodeList[0]).StringText;
    }
    else
    {
        Console.WriteLine("判断微博链接地址的标准出错!");
    }

    parser.Reset();

    // Self introduction: the TITLE of the second child, which must be a Span.
    NodeList selfIntroNodeList = parser.ExtractAllNodesThatMatch(selfIntroFilter);
    NodeList introChildren = selfIntroNodeList.Size() == 1 ? selfIntroNodeList[0].Children : null;
    if (introChildren != null && introChildren.Size() > 1 && introChildren[1].GetType().Equals(typeof(Span)))
    {
        user.SelfIntroduction = ((Span)introChildren[1]).GetAttribute("TITLE");
    }
    else
    {
        Console.WriteLine("判断自我描述的标准出错!");
    }

    parser.Reset();

    // Tags: the text of every matching Span, each followed by a space.
    NodeList tagsNodeList = parser.ExtractAllNodesThatMatch(tagsFilter);
    System.Text.StringBuilder tagText = new System.Text.StringBuilder();
    for (int i = 0; i < tagsNodeList.Size(); i++)
    {
        if (tagsNodeList[i].GetType().Equals(typeof(Span)))
        {
            tagText.Append(((Span)tagsNodeList[i]).ToPlainTextString()).Append(" ");
        }
    }

    user.Tags = tagText.ToString();
    parser.Reset();

    // Profile attributes.
    NodeList profileNodeList = parser.ExtractAllNodesThatMatch(profileFilter);
    if (profileNodeList.Size() != 1)
    {
        Console.WriteLine("判断用户属性信息的标准出错!");
        return;
    }

    NodeList profileRoot = profileNodeList[0].Children;
    if (profileRoot == null)
    {
        // The container is empty: there are no attributes to read.
        return;
    }

    // All useful information sits inside <a> tags, so collect those and then decide whether to
    // take the link text or the title of a nested <em>.
    NodeClassFilter aTagFilter = new NodeClassFilter(typeof(ATag));
    NodeList profileList = profileRoot.ExtractAllNodesThatMatch(aTagFilter, true);
    for (int j = 0; j < profileList.Size(); j++)
    {
        ATag aTag = (ATag)profileList[j];
        if (aTag.Attributes.Contains("TITLE"))
        {
            user.Profile += aTag.GetAttribute("TITLE") + " ";
            continue;
        }

        // A node with node-type="infoSlide" marks the end of the attribute list.
        if (aTag.Attributes.Contains("NODE-TYPE") && "infoSlide".Equals(aTag.GetAttribute("NODE-TYPE")))
        {
            break;
        }

        NodeList linkChildren = aTag.Children;
        // Exact type match on purpose: only a plain TagNode (e.g. <em>) counts, not its subclasses.
        if (linkChildren != null && linkChildren.Size() > 0 && linkChildren[0].GetType().Equals(typeof(TagNode)))
        {
            // The link wraps an <em> child whose title holds the value.
            TagNode tagNode = (TagNode)linkChildren[0];
            user.Profile += tagNode.GetAttribute("TITLE") + " ";
        }
        else
        {
            // Otherwise use the text contained in the <a> tag directly.
            user.Profile += aTag.StringText + " ";
        }
    }
}
/// <summary>
/// Fetches the reference list for the given paper IDs and returns one cleaned-up string per
/// reference entry.
/// </summary>
/// <remarks>
/// The page comes from <c>_HttpUtil.getPaperReferenceHTML</c>. Entries are the tag children of
/// the <c>div</c> with <c>id="export_container"</c>; for each, line breaks become spaces and
/// everything up to and including the first ']' is dropped. When a list is produced,
/// <c>_Progressable.onFinish</c> is invoked with it (if <c>_Progressable</c> is set).
/// </remarks>
/// <param name="paper_id">Paper IDs passed through to the HTTP helper.</param>
/// <returns>
/// The list of reference strings (empty when the container has no children), or <c>null</c>
/// when the page is empty or contains no container.
/// </returns>
protected ArrayList getPaperReferenceByID(ArrayList paper_id)
{
    string html_page = _HttpUtil.getPaperReferenceHTML(paper_id);
    if (string.IsNullOrEmpty(html_page))
    {
        return null;
    }

    Parser p = new Parser(new Lexer(html_page));
    AndFilter containerFilter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("id", "export_container"));
    NodeList childs = p.ExtractAllNodesThatMatch(containerFilter);
    if (childs == null || childs.Count <= 0)
    {
        return null;
    }

    ArrayList ref_list = new ArrayList();

    // An empty container exposes no child list; treat it as "no references" rather than crashing.
    NodeList ref_childs = childs[0].Children;
    if (ref_childs != null)
    {
        for (int i = 0; i < ref_childs.Count; ++i)
        {
            ITag tag = ref_childs[i] as ITag;
            if (tag == null)
            {
                continue;
            }

            string str = tag.ToPlainTextString();
            str = str.Replace('\r', ' ').Replace('\n', ' ');
            // Drop the leading "[n]" marker; when there is no ']' the text is kept whole.
            str = str.Substring(str.IndexOf(']') + 1);
            ref_list.Add(str);
        }
    }

    if (_Progressable != null)
    {
        _Progressable.onFinish(ref_list);
    }

    return ref_list;
}
/// <summary>
/// Reads the HTML file at <paramref name="xmlFile"/> and builds one <see cref="Model.Play"/> per
/// showtime (<c>_TYPE="expiry"</c>) of every movie block (<c>METHOD="mdShowtime"</c>).
/// </summary>
/// <remarks>
/// The cinema ID is the first run of four digits in <paramref name="xmlFile"/>. The movie ID is
/// the first six-digit (otherwise five-digit) run in the <c>HREF</c> of the movie block's first
/// <c>CLASS="c_000"</c> element. Movies without a readable ID, and showtimes without a usable
/// <c>SHOWTIMEID</c> or <c>TIME</c>, are skipped. A missing or unparsable price is stored as 0.
/// </remarks>
/// <param name="xmlFile">Path of the saved page; also the source of the cinema ID.</param>
/// <returns>The extracted showtimes, or <c>null</c> when the page has no movie block.</returns>
public List<Model.Play> getPlays(string xmlFile)
{
    // Number of leading characters removed from TIME, as in the original Remove(0, 10).
    // NOTE(review): presumably a "yyyy-MM-dd" date prefix - confirm against the page markup.
    const int timePrefixLength = 10;

    string cinemaID = Regex.Match(xmlFile, @"\d\d\d\d").Value;
    string html = File.ReadAllText(xmlFile);
    List<Model.Play> plays = new List<Model.Play>();

    // Nodes of the movie list.
    Parser parser = new Parser(new Lexer(html));
    NodeList movieNodeList = parser.ExtractAllNodesThatMatch(new HasAttributeFilter("METHOD", "mdShowtime"));
    if (movieNodeList.Count < 1)
    {
        return null;
    }

    for (int i = 0; i < movieNodeList.Count; i++)
    {
        // Each movie block is re-parsed on its own.
        string movieHtml = movieNodeList[i].ToHtml();

        // Without a movie ID the showtimes cannot be attributed to anything.
        int movieId;
        if (!int.TryParse(ReadMovieId(movieHtml), out movieId))
        {
            continue;
        }

        // Showtimes of this movie.
        Parser showtimeParser = new Parser(new Lexer(movieHtml));
        NodeList playNodes = showtimeParser.ExtractAllNodesThatMatch(new HasAttributeFilter("_TYPE", "expiry"));
        for (int j = 0; j < playNodes.Count; j++)
        {
            ITag playTag = playNodes[j] as ITag;
            if (playTag == null || playTag.Attributes == null)
            {
                continue;
            }

            object showtimeId = playTag.Attributes["SHOWTIMEID"];
            object time = playTag.Attributes["TIME"];
            int playId;
            if (showtimeId == null || time == null || !int.TryParse(showtimeId.ToString(), out playId))
            {
                continue;
            }

            string strTime = time.ToString().Trim();
            if (strTime.Length < timePrefixLength)
            {
                continue;
            }

            Model.Play play = new Model.Play();
            play.CinemaID = int.Parse(cinemaID);
            play.MovieID = movieId;
            play.PlayID = playId;
            play.PlayName = strTime.Remove(0, timePrefixLength).Trim();
            play.Price = ReadPrice(playTag);
            plays.Add(play);
        }
    }

    return plays;
}

/// <summary>
/// Returns the movie ID digits found in the HREF of the first CLASS="c_000" element of
/// <paramref name="movieHtml"/>, or an empty string when none can be read.
/// </summary>
private static string ReadMovieId(string movieHtml)
{
    Parser movieParser = new Parser(new Lexer(movieHtml));
    NodeList idNodes = movieParser.ExtractAllNodesThatMatch(new HasAttributeFilter("CLASS", "c_000"));
    if (idNodes.Count < 1)
    {
        return string.Empty;
    }

    ITag idTag = idNodes[0] as ITag;
    object href = (idTag == null || idTag.Attributes == null) ? null : idTag.Attributes["HREF"];
    if (href == null)
    {
        return string.Empty;
    }

    // Prefer a six-digit ID; fall back to five digits.
    Match match = Regex.Match(href.ToString(), @"\d\d\d\d\d\d");
    if (!match.Success)
    {
        match = Regex.Match(href.ToString(), @"\d\d\d\d\d");
    }

    return match.Success ? match.Value : string.Empty;
}

/// <summary>
/// Reads the price shown for a showtime: the text of the node reached via
/// FirstChild, NextSibling, FirstChild, NextSibling, minus its first character.
/// Returns 0 when the node is missing or the text is not a number.
/// </summary>
private static float ReadPrice(ITag playTag)
{
    INode priceNode = playTag.FirstChild;
    priceNode = priceNode == null ? null : priceNode.NextSibling;
    priceNode = priceNode == null ? null : priceNode.FirstChild;
    priceNode = priceNode == null ? null : priceNode.NextSibling;
    if (priceNode == null)
    {
        return 0f;
    }

    string strPrice = priceNode.ToPlainTextString();
    strPrice = strPrice == null ? string.Empty : strPrice.Trim();
    if (strPrice.Length == 0)
    {
        return 0f;
    }

    // NOTE(review): the first character is dropped, as before - presumably a currency sign; confirm.
    strPrice = strPrice.Remove(0, 1);

    float price;
    bool parsed = float.TryParse(
        strPrice,
        System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
        System.Globalization.CultureInfo.InvariantCulture,
        out price);
    return parsed ? price : 0f;
}