/// <summary>
/// Appends one summary entry (tag name, ID, name, class and start position) to
/// <c>parseResult</c> for <paramref name="node"/>, for every node below it, and for every
/// sibling that follows it, in document order.
/// </summary>
/// <remarks>
/// End tags are not reported. A tag is reported only if its start position is not yet in
/// <c>start</c>; the position is added once the tag has been reported.
/// </remarks>
/// <param name="node">First node to report; a null node is ignored.</param>
private void paserData(INode node)
{
    // Walk the sibling chain iteratively. Recursing into every following sibling from every
    // node would re-walk the tail of each sibling list once per preceding sibling.
    for (INode current = node; current != null; current = current.NextSibling)
    {
        paserSubtree(current);
    }
}

/// <summary>Reports <paramref name="node"/> and then each of its descendants, pre-order.</summary>
private void paserSubtree(INode node)
{
    ITag tag = getTag(node);
    if (tag != null && !tag.IsEndTag() && !start.Contains(tag.StartPosition))
    {
        object oId = tag.GetAttribute("ID");
        object oName = tag.GetAttribute("name");
        object oClass = tag.GetAttribute("class");
        parseResult += tag.TagName + ":\r\nID:" + oId + " Name:" + oName + " Class:" + oClass + " StartPosition:" + tag.StartPosition.ToString() + "\r\n";
        start.Add(tag.StartPosition);
    }

    // Child nodes
    if (node.Children != null)
    {
        int childCount = node.Children.Count;
        for (int i = 0; i < childCount; i++)
        {
            paserSubtree(node.Children[i]);
        }
    }
}
/// <summary>
/// Looks through the nodes returned by <c>GetChildren()</c> for a tag whose NAME attribute
/// equals <paramref name="name"/> and returns the first one found.
/// </summary>
/// <param name="name">Value the NAME attribute has to equal.</param>
/// <returns>The first matching tag, or <c>null</c> when no child tag matches.</returns>
public virtual ITag SearchByName(System.String name)
{
    ISimpleNodeIterator children = GetChildren();
    while (children.HasMoreNodes())
    {
        // Non-tag nodes cannot carry attributes and are skipped.
        ITag candidate = children.NextNode() as ITag;
        if (candidate == null)
        {
            continue;
        }

        System.String candidateName = candidate.GetAttribute("NAME");
        if (candidateName != null && candidateName.Equals(name))
        {
            return candidate;
        }
    }

    return null;
}
/// <summary>
/// Parses <paramref name="htmlContent"/>, selects the nodes accepted by <c>nodefilter</c>
/// and extracts one string from each selected tag.
/// </summary>
/// <param name="htmlContent">HTML text to parse.</param>
/// <returns>
/// For each selected tag: the value of the <c>href</c> attribute when <c>needValue</c> is
/// "href", otherwise the text of the tag's first child. Selected nodes that are not tags,
/// and tags without a first child in text mode, contribute no entry.
/// </returns>
public List<String> start(String htmlContent)
{
    Parser parser = new Parser();
    parser.InputHTML = htmlContent;
    NodeList nodelist = parser.Parse(nodefilter);

    List<String> results = new List<String>();
    int size = nodelist.Size();
    for (int i = 0; i < size; i++)
    {
        // Test for ITag, not INode: every element is an INode, so the cast below could
        // yield null and be dereferenced.
        ITag tag = nodelist.ElementAt(i) as ITag;
        if (tag == null)
        {
            continue;
        }

        if (needValue == "href")
        {
            results.Add(tag.GetAttribute(needValue));
        }
        else if (tag.FirstChild != null)
        {
            results.Add(tag.FirstChild.GetText());
        }
    }

    return results;
}
/// <summary>
/// Filter predicate: accepts a tag only when it carries at least one attribute and every
/// entry of <c>mRules</c> (attribute name mapped to required value) matches the tag.
/// </summary>
/// <param name="node">Node under test.</param>
/// <returns><c>true</c> when the node is a tag satisfying all rules; otherwise <c>false</c>.</returns>
public bool Accept(INode node)
{
    ITag tagNode = node as ITag;
    if (tagNode == null)
    {
        return false;
    }

    if (tagNode.Attributes == null || tagNode.Attributes.Count <= 0)
    {
        return false;
    }

    // All rules must hold, so the first mismatch decides the outcome.
    foreach (var rule in mRules)
    {
        String actual = tagNode.GetAttribute(rule.Key);
        if (actual != rule.Value)
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Filter predicate: accepts tags whose <c>key</c> attribute starts with <c>value</c>.
/// </summary>
/// <param name="node">Node under test.</param>
/// <returns>
/// <c>true</c> when the node is a tag that has the attribute and its value starts with the
/// configured prefix; <c>false</c> for non-tag nodes and for tags without the attribute.
/// </returns>
public bool Accept(INode node)
{
    ITag tag = node as ITag;
    if (tag == null)
    {
        return false;
    }

    // A missing attribute means "no match" rather than a null dereference.
    string attributeValue = tag.GetAttribute(key);
    return attributeValue != null && attributeValue.StartsWith(value);
}
/// <summary>
/// Parses <paramref name="HtmlString"/>, picks the <paramref name="num"/>-th <c>dd</c>
/// element and reads the <c>href</c> and <c>title</c> attributes of the node that follows
/// that element's first child.
/// </summary>
/// <param name="HtmlString">HTML text to parse.</param>
/// <param name="num">Zero-based index into the list of <c>dd</c> elements.</param>
/// <param name="href">Receives the <c>href</c> attribute; stays "" when nothing is found.</param>
/// <param name="title">Receives the <c>title</c> attribute; stays "" when nothing is found.</param>
public static void parseIndexHtml(string HtmlString, int num, out string href, out string title)
{
    href = "";
    title = "";

    // Parse the document and collect every <dd> element.
    Parser parser = Parser.CreateParser(HtmlString, "utf-8");
    NodeFilter filter = new TagNameFilter("dd");
    NodeList nodes = parser.Parse(filter);

    if (num < 0 || nodes.Size() <= num)
    {
        return;
    }

    INode textnode = nodes[num];
    if (textnode.FirstChild == null)
    {
        return;
    }

    // The tag of interest is the second child of the <dd>.
    INode linkNode = textnode.FirstChild.NextSibling;
    if (linkNode == null)
    {
        return;
    }

    ITag tag = getTag(linkNode);
    if (tag == null)
    {
        return;
    }

    href = tag.GetAttribute("href");
    title = tag.GetAttribute("title");
}
/// <summary>
/// Downloads the chapter index page, collects every tag with an <c>href</c> attribute under
/// the "pc_list" elements, then downloads each linked chapter and writes its non-empty text
/// lines to <c>books/&lt;chapter title&gt;</c>.
/// </summary>
/// <remarks>
/// Links whose first child is not a text node, and chapter pages without a "content1"
/// element, are skipped.
/// </remarks>
static void GetMenus()
{
    string str = GetHtml("https://www.qisuu.la/du/23/23361/", true);
    Lexer lexer = new Lexer(str);
    Parser parser = new Parser(lexer);
    NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "pc_list"));

    List<INode> nodes = new List<INode>();
    // NOTE(review): the loop starts at 1, so the first "pc_list" match is never scanned.
    // Kept as found; confirm this is intentional.
    for (int i = 1; i < htmlNodes.Count; i++)
    {
        ForeachNode(htmlNodes[i], (node) =>
        {
            // Only tags can carry an href; other node kinds are ignored.
            ITag tag = node as ITag;
            if (tag != null && tag.GetAttribute("href") != null)
            {
                nodes.Add(node);
            }
        });
    }

    foreach (INode node in nodes)
    {
        ITag tag = node as ITag;
        string href = tag.GetAttribute("href");

        // The chapter title is the link's first child, which must be a text node.
        IText nextNode = null;
        if (node.Children != null && node.Children.Count > 0)
        {
            nextNode = node.Children[0] as IText;
        }
        if (nextNode == null)
        {
            continue;
        }
        string title = nextNode.GetText();

        str = GetHtml("https://www.qisuu.la/du/23/23361/" + href, true);
        lexer = new Lexer(str);
        parser = new Parser(lexer);
        htmlNodes = parser.Parse(new HasAttributeFilter("id", "content1"));
        if (htmlNodes.Count == 0 || htmlNodes[0].Children == null)
        {
            continue;
        }

        NodeList paragraphs = htmlNodes[0].Children;
        System.Text.StringBuilder result = new System.Text.StringBuilder();
        for (int i = 0; i < paragraphs.Count; i++)
        {
            INode txt = paragraphs[i];
            if (txt is IText)
            {
                string line = txt.GetText();
                line = line.Replace(" ", "").Trim();
                if (line != "")
                {
                    result.Append(line).Append("\n");
                }
            }
        }

        Console.WriteLine(title);
        File.WriteAllText("books/" + title, result.ToString());
    }
}
/// <summary>
/// Runs a fixed wallhaven.cc search, reads the thumbnail URL of every "lazyload" element
/// and rewrites it into the corresponding full-size image URL.
/// </summary>
/// <returns>The rewritten URLs, one per thumbnail that has a <c>data-src</c> attribute.</returns>
static List<string> GetUrls()
{
    List<string> ret = new List<string>();
    string url = "https://wallhaven.cc/search?";
    Dictionary<string, string> dic = new Dictionary<string, string>()
    {
        { "q", "明日方舟" },
        { "categories", "111" },
        { "purity", "100" },
        { "resolutions", "1280x800,1600x1000,1920x1200,2560x1600,3840x2400" },
        { "sorting", "relevance" },
        { "order", "desc" },
    };
    foreach (var pair in dic)
    {
        url += $"{pair.Key}={WebUtility.UrlEncode(pair.Value)}&";
    }

    // WebClient is disposable; release it as soon as the page has been read.
    string html;
    using (WebClient myWebClient = new WebClient())
    {
        html = myWebClient.DownloadString(url);
    }

    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "lazyload"));
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        INode node = htmlNodes[i];
        ITag tag = node as ITag;
        if (tag == null)
        {
            continue;
        }

        string src = tag.GetAttribute("data-src");
        if (src == null)
        {
            continue;
        }

        // NOTE(review): assumes the element marking PNG images sits at
        // Parent.Children[3].Children[2] — tied to the page markup; confirm.
        ITag png = (ITag)(node.Parent.Children[3].Children[2]);
        string val = png.GetAttribute("class");
        val = val == "png" ? val : "jpg";

        src = src.Replace(".jpg", $".{val}");
        src = src.Replace("https://th.wallhaven.cc/small/", "https://w.wallhaven.cc/full/");

        // Prefix only the last path segment. A plain Replace on the file name would also
        // rewrite any earlier occurrence and throws when the file name is empty.
        int lastSlash = src.LastIndexOf('/');
        src = src.Substring(0, lastSlash + 1) + "wallhaven-" + src.Substring(lastSlash + 1);

        Console.WriteLine(src);
        ret.Add(src);
    }

    return ret;
}
/// <summary>
/// Downloads one author page, prints the heading of its first "listBox" element and then,
/// for every second child of the list starting at index 1, prints the book's title, image
/// source, link id, description, author, status, size and word count.
/// </summary>
/// <remarks>
/// All values are read from fixed child positions, so the method depends on the exact
/// markup of the page.
/// </remarks>
static void AuthorBooks()
{
    string pageHtml = GetHtml("https://www.qisuu.la/Writer/13.html", true);
    Parser pageParser = new Parser(new Lexer(pageHtml));
    NodeList listBoxes = pageParser.Parse(new HasAttributeFilter("class", "listBox"));

    INode listBox = listBoxes[0];
    string heading = listBox.Children[1].Children[1].Children[0].GetText();
    Console.WriteLine(heading);

    INode bookList = listBox.Children[3];
    for (int index = 1; index < bookList.Children.Count; index += 2)
    {
        INode entry = bookList.Children[index];
        ITag infoTag = (ITag)entry.Children[1];
        ITag linkTag = (ITag)entry.Children[3];
        ITag descTag = (ITag)entry.Children[5];

        // Book title, cover image, link id and description.
        string bookTitle = linkTag.Children[1].GetText();
        ITag coverTag = (ITag)linkTag.Children[0];
        string bookImgSrc = coverTag.GetAttribute("src");
        string rawHref = linkTag.GetAttribute("href");
        string bookHref = rawHref.Replace("/Shtml", "").Replace(".html", "");
        string bookDesc = descTag.Children[0].GetText();
        Console.WriteLine(bookTitle);
        Console.WriteLine(bookImgSrc);
        Console.WriteLine(bookHref);
        Console.WriteLine(bookDesc);

        // Author, status, size and word count are read from the info tag.
        string authorLabel = infoTag.Children[0].GetText();
        string authorName = infoTag.Children[1].Children[0].GetText();
        string author = authorLabel + authorName;
        string status = infoTag.Children[3].GetText();
        string size = infoTag.Children[5].GetText();
        string words = infoTag.Children[7].GetText();
        Console.WriteLine(author);
        Console.WriteLine(status);
        Console.WriteLine(size);
        Console.WriteLine(words);
        Console.WriteLine();
    }
}
/// <summary>
/// Parses the first-level page and returns the URL of the second-level page.
/// </summary>
/// <param name="url">Address of the first-level page.</param>
/// <returns>
/// "http://www.cjmsa.gov.cn" followed by the <c>href</c> of the last child of the first
/// <c>li class="hui3"</c> element, or the literal "failed" when anything goes wrong.
/// </returns>
public static string GetUrl(string url)
{
    try
    {
        // Select every <li class="hui3"> element of the page.
        NodeFilter itemFilter = new AndFilter(new TagNameFilter("li"), new HasAttributeFilter("class", "hui3"));
        NodeList items = GetNodeList(url, itemFilter);

        // The first entry is the most recent one; its last child carries the link.
        ITag newestItem = items[0] as ITag;
        ITag link = newestItem.LastChild as ITag;

        return "http://www.cjmsa.gov.cn" + link.GetAttribute("href");
    }
    catch (Exception ex)
    {
        Console.WriteLine("解析第一层出错" + ex.Message);
        return "failed";
    }
}
/// <summary>
/// Copies the ID and NAME attributes of <paramref name="obTag"/> into <c>m_strId</c> and
/// <c>m_strName</c>.
/// </summary>
/// <param name="obTag">Tag to read the attributes from.</param>
internal virtual void ConvertFromTag(ITag obTag)
{
    var idValue = obTag.GetAttribute("ID");
    m_strId = idValue;

    var nameValue = obTag.GetAttribute("NAME");
    m_strName = nameValue;
}
/// <summary>
/// Downloads the eLong Shanghai sights page, collects the TITLE attribute of each entry in
/// the "zyjd-li" block, writes the names to <c>data1.xml</c> as
/// <c>scenic/item/scenicname</c> elements, and passes the semicolon-separated names to the
/// view through <c>ViewBag.Message</c>.
/// </summary>
/// <returns>The default view for this action.</returns>
public ActionResult YiLong()
{
    string title = "";
    List<string> secnicNameList = new List<string>();

    string strBaidu = "http://trip.elong.com/shanghai/jingdian/";
    Uri uri = new Uri(strBaidu);
    WebRequest webRequest = WebRequest.Create(uri);
    httpWebRequest = (HttpWebRequest)webRequest;
    // Send the request and keep the response.
    WebResponse webResponse = httpWebRequest.GetResponse();
    httpWebResponse = (HttpWebResponse)webResponse;

    // Read the whole body as UTF-8. The using blocks close the reader and the stream even
    // when reading fails.
    string strResult;
    using (Stream stream = httpWebResponse.GetResponseStream())
    using (StreamReader sr = new StreamReader(stream, Encoding.UTF8))
    {
        strResult = sr.ReadToEnd();
    }

    // Parse the HTML with Winista.HtmlParser.
    Parser parser = Parser.CreateParser(strResult, "utf-8");
    NodeFilter filterDiv = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "zyjd-li"));
    NodeList nodeList = parser.Parse(filterDiv);
    // NOTE(review): assumes the entries are the children of the second child of the first
    // "zyjd-li" div — tied to the page markup; confirm.
    NodeList subNodeList = nodeList[0].Children[1].Children;
    if (subNodeList != null)
    {
        for (int i = 0; i < subNodeList.Count; i++)
        {
            ITag tempTag = subNodeList[i] as ITag;
            if (tempTag == null || tempTag.Children == null || tempTag.Children.Count == 0)
            {
                continue;
            }

            // The entry's first child carries the sight name in its TITLE attribute.
            ITag titleTag = tempTag.Children[0] as ITag;
            if (titleTag == null)
            {
                continue;
            }

            string scenicName = titleTag.GetAttribute("TITLE");
            if (scenicName == null)
            {
                continue;
            }

            secnicNameList.Add(scenicName);
            title += scenicName + ";";
        }
    }

    // Build the XML document: declaration first, then the <scenic> root element.
    xmlDoc = new XmlDocument();
    XmlDeclaration xmldecl = xmlDoc.CreateXmlDeclaration("1.0", "gb2312", null);
    xmlDoc.AppendChild(xmldecl);
    xmlElement = xmlDoc.CreateElement("", "scenic", "");
    xmlDoc.AppendChild(xmlElement);

    foreach (string scenicName in secnicNameList)
    {
        // <item><scenicname>name</scenicname></item>, appended to the root.
        XmlElement xeItem = xmlDoc.CreateElement("item");
        XmlElement xeName = xmlDoc.CreateElement("scenicname");
        xeName.InnerText = scenicName;
        xeItem.AppendChild(xeName);
        xmlElement.AppendChild(xeItem);
    }

    // Save the finished document.
    xmlDoc.Save(Server.MapPath("data1.xml"));
    ViewBag.Message = title;
    return View();
}
/// <summary>
/// Crawls the yododo.com question list: reads the area links from the first list item of
/// the "miniarea-list clearfix" list, opens every area page except "全部" and "中国", and
/// for each sub-area link whose text equals the <c>ClassName</c> of a level-3 route class
/// calls <c>paserData</c> with the link node, its URL and the class id. Prints "success"
/// when done.
/// </summary>
/// <param name="context">HTTP context passed on to <c>Print</c>.</param>
private void GrapBaiduMsg(HttpContext context)
{
    string listPageHtml = GetHtmlStr("http://www.yododo.com/ask/list/");

    ClassLibrary.BLL.RouteClass routeClassBll = new ClassLibrary.BLL.RouteClass();
    List<ClassLibrary.Model.RouteClass> routeClasses = routeClassBll.GetModelList("classLevel = 3");

    // Filters reused for every page below.
    NodeFilter areaListFilter = new AndFilter(new TagNameFilter("ul"), new HasAttributeFilter("class", "miniarea-list clearfix"));
    NodeFilter listItemFilter = new TagNameFilter("li");
    NodeFilter anchorFilter = new NodeClassFilter(typeof(ATag));

    // Narrow the page down to the area list, then to its first list item.
    Parser parser = Parser.CreateParser(listPageHtml, "utf-8");
    NodeList areaLists = parser.Parse(areaListFilter);
    string areaListHtml = areaLists[0].ToHtml();

    parser = Parser.CreateParser(areaListHtml, "utf-8");
    NodeList listItems = parser.Parse(listItemFilter);
    string firstItemHtml = listItems[0].ToHtml();

    parser = Parser.CreateParser(firstItemHtml, "utf-8");
    NodeList areaLinks = parser.Parse(anchorFilter);

    for (int areaIndex = 0; areaIndex < areaLinks.Count; areaIndex++)
    {
        ITag areaTag = getTag(areaLinks[areaIndex]);
        string areaUrl = "http://www.yododo.com" + areaTag.GetAttribute("href"); // one page per area
        string areaName = areaTag.ToPlainTextString();
        if (areaName != "全部" && areaName != "中国")
        {
            // Each area page has its own list of sub-area links.
            parser = Parser.CreateParser(GetHtmlStr(areaUrl), "utf-8");
            NodeList subAreaLists = parser.Parse(areaListFilter);
            string subAreaListHtml = subAreaLists[0].ToHtml();

            parser = Parser.CreateParser(subAreaListHtml, "utf-8");
            NodeList subAreaLinks = parser.Parse(anchorFilter);

            for (int linkIndex = 0; linkIndex < subAreaLinks.Count; linkIndex++)
            {
                ITag subAreaTag = getTag(subAreaLinks[linkIndex]);
                // NOTE(review): the original comment marks the "s1" suffix as "resolved" — confirm.
                string subAreaUrl = "http://www.yododo.com" + subAreaTag.GetAttribute("href") + "s1";
                string subAreaName = subAreaTag.ToPlainTextString();
                if (subAreaName == "全部")
                {
                    continue;
                }

                // First route class whose name equals the link text, if any.
                ClassLibrary.Model.RouteClass matchedClass = null;
                foreach (ClassLibrary.Model.RouteClass routeClass in routeClasses)
                {
                    if (routeClass.ClassName == subAreaName)
                    {
                        matchedClass = routeClass;
                        break;
                    }
                }

                if (matchedClass != null)
                {
                    paserData(subAreaLinks[linkIndex], subAreaUrl, matchedClass.ID);
                }
            }
        }
    }

    Print(context, "success");
}
/// <summary>
/// Downloads <paramref name="url"/>, hands the HTML-decoded page to <c>download_pic</c>,
/// then follows every anchor whose <c>href</c> is not yet in <c>links</c>: the href is
/// recorded, the visit is logged to the console and to the crawl log file, and the target
/// is crawled recursively.
/// </summary>
/// <remarks>
/// A non-OK status prints "Error"; any exception while fetching or processing the page
/// prints "404" and ends the processing of this page.
/// </remarks>
/// <param name="url">Absolute URL of the page to crawl.</param>
static void download_url(string url)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
    request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393";
    request.Timeout = 30000;
    try
    {
        // Read the whole page and close the response BEFORE following links. Recursing
        // while the response is still open keeps one connection per recursion level alive.
        string encode = null;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode == HttpStatusCode.OK)
            {
                using (Stream s = response.GetResponseStream())
                using (StreamReader sr = new StreamReader(s, Encoding.UTF8))
                {
                    string html = sr.ReadToEnd();
                    encode = HttpUtility.HtmlDecode(html);
                }
            }
            else
            {
                Console.WriteLine("Error");
            }
        }

        if (encode == null)
        {
            return;
        }

        download_pic(encode);

        Lexer lexer = new Lexer(encode);
        Parser par = new Parser(lexer);
        NodeFilter nodefilter = new TagNameFilter("a");
        NodeList nodes = par.ExtractAllNodesThatMatch(nodefilter);
        for (int i = 0; i < nodes.Count; i++)
        {
            ITag tag = nodes[i] as ITag;
            if (tag == null)
            {
                continue;
            }

            // Anchors without an href are not links; known hrefs were already visited.
            string href = tag.GetAttribute("href");
            if (href == null || links.Contains(href))
            {
                continue;
            }

            links.Add(href);
            string target = "http://taylorpictures.net/" + href;
            Console.WriteLine("accessing " + target);
            using (FileStream fs = new FileStream(@"e:/Photos/crawl_log.txt", FileMode.Append))
            {
                byte[] bytes = Encoding.UTF8.GetBytes("accessing " + target + "\r\n");
                fs.Write(bytes, 0, bytes.Length);
            }

            // NOTE(review): recursion depth is bounded only by the number of distinct links.
            download_url(target);
        }
    }
    catch
    {
        // Best-effort crawl: every failure on this page is reported the same way.
        Console.WriteLine("404");
    }
}
/// <summary>
/// Runs a site search for a fixed query and prints, for every "result f s0" element, the
/// reduced link id (<c>href:</c>), the link text (<c>title:</c>) and the text of any
/// "c-abstract" element (<c>desc:</c>), followed by a blank line.
/// </summary>
/// <remarks>
/// The title link is read from <c>Children[1].Children[1]</c> of each result and the
/// description from <c>Children[3]</c>, so the method depends on the exact page markup.
/// </remarks>
static void SearchBook()
{
    // The query is a URL parameter, so it must be URL-encoded rather than HTML-encoded.
    string query = WebUtility.UrlEncode("唐家三少");
    string url = "http://zhannei.baidu.com/cse/site/?cc=qisuu.la&s=11735927224209550458&q=" + query;
    string html = GetHtml(url);
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);
    NodeList htmlNodes = parser.Parse(new HasAttributeFilter("class", "result f s0"));
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        INode node = htmlNodes[i];
        INode nodeTitle = node.Children[1];
        INode nodeATitle = nodeTitle.Children[1];

        // --- link id -------------------------------------------------------------
        ITag aTag = nodeATitle as ITag;
        object attribute = aTag != null ? aTag.GetAttribute("href") : null;
        if (attribute != null)
        {
            string href = attribute.ToString().Trim();
            // Drop the scheme, any "/?" query tail, and the host.
            href = Regex.Replace(href, "^http.*//", "");
            int queryStart = href.IndexOf("/?");
            if (queryStart != -1)
            {
                href = href.Substring(0, queryStart);
            }
            int firstSlash = href.IndexOf("/");
            if (firstSlash != -1)
            {
                href = href.Substring(firstSlash + 1);
            }
            href = href.Replace("Shtml", "");
            href = href.Replace(".html", "");

            // Paths containing "du" keep their third segment, paths containing "Writer"
            // their second. Shorter paths are left unchanged.
            if (href.Contains("du"))
            {
                string[] parts = href.Split('/');
                if (parts.Length > 2)
                {
                    href = parts[2];
                }
            }
            if (href.Contains("Writer"))
            {
                string[] parts = href.Split('/');
                if (parts.Length > 1)
                {
                    href = parts[1];
                }
                Console.WriteLine("作者");
            }
            if (href != "")
            {
                Console.WriteLine($"href:{href}");
            }
        }

        // --- title: concatenated text children of the link ----------------------
        string title = "";
        if (nodeATitle != null && nodeATitle.Children != null)
        {
            for (int j = 0; j < nodeATitle.Children.Count; j++)
            {
                INode tempNode = nodeATitle.Children[j];
                if (tempNode is IText)
                {
                    title += tempNode.GetText().Trim();
                }
            }
        }
        title = title.Trim();
        if (title != "")
        {
            Console.WriteLine($"title:{title}");
        }

        // --- description: text children of every "c-abstract" tag ---------------
        INode nodeDiv = node.Children[3];
        ForeachNode(nodeDiv, (_node) =>
        {
            ITag tag = _node as ITag;
            if (tag == null)
            {
                return;
            }

            object _class = tag.GetAttribute("class");
            if (_class == null || _class.ToString() != "c-abstract")
            {
                return;
            }

            string desc = "";
            if (tag.Children != null)
            {
                for (int j = 0; j < tag.Children.Count; j++)
                {
                    INode tempNode = tag.Children[j];
                    if (tempNode is IText)
                    {
                        desc += tempNode.GetText().Trim();
                    }
                }
            }
            desc = desc.Trim();
            if (desc != "")
            {
                Console.WriteLine($"desc:{desc}");
            }
        });

        Console.WriteLine();
    }
}