/// <summary>
/// Reads a named attribute from an element.
/// </summary>
/// <param name="el">Element to read from; may be null.</param>
/// <param name="name">Key used to index <c>el.Attributes</c>.</param>
/// <returns>
/// The value of <c>el.Attributes[name]</c>, or null when <paramref name="el"/> is null.
/// </returns>
public static string GetAttr(NSoup.Nodes.Element el, string name)
{
    if (el == null)
    {
        return null;
    }

    // The indexer result is handed back unchanged; a null result stays null.
    return el.Attributes[name];
}
/// <summary>
/// Runs a CSS selector query against an element and returns the matches as a list.
/// </summary>
/// <param name="element">Element on which <c>Select</c> is invoked; must not be null.</param>
/// <param name="cssQuery">Selector string passed straight through to <c>element.Select</c>.</param>
/// <returns>A new list holding the elements returned by the query.</returns>
public static List<NSoup.Nodes.Element> Select(NSoup.Nodes.Element element, string cssQuery)
{
    NSoup.Select.Elements matches = element.Select(cssQuery);

    // Copy into a standalone List so callers are not tied to the NSoup collection type.
    return new List<NSoup.Nodes.Element>(matches.ToArray());
}
/// <summary>
/// Requests a Google image search page for the given text and collects the image URLs
/// found in the per-result metadata blocks.
/// </summary>
/// <param name="question">Search text; commas are removed before it is appended to the search URL.</param>
/// <param name="userAgent">User-Agent value sent with the request and passed to the redirect check.</param>
/// <returns>
/// The image URLs extracted before any failure. If an exception is thrown, it is stored in
/// <c>Error</c> and the (possibly partial or empty) list is still returned.
/// </returns>
public List<String> FindImages(String question, String userAgent)
{
    List<String> imagesList = new List<String>();
    try
    {
        String googleUrl = "https://www.google.com/search?tbm=isch&q=" + question.Replace(",", "");
        NSoup.Nodes.Document htmlDoc = NSoupClient.Connect(googleUrl).UserAgent(userAgent).Timeout(10 * 1000).Get();

        // Handle automatic redirects; the document is passed by ref, so it may be replaced here.
        checkForRedirectsOnHTMLDocument(ref htmlDoc, userAgent);

        // Captures the value of the "ou" key in the metadata text:
        // everything between "ou":" and the next double quote.
        Regex imageUrlRegex = new Regex("\"ou\":\"(.*?)\"");

        // div with class="y yi" containing div with class="rg_di rg_bx rg_el ivg-i"
        NSoup.Select.Elements div_with_images = htmlDoc.Select("div.y.yi div.rg_di.rg_bx.rg_el.ivg-i");
        foreach (NSoup.Nodes.Element div_with_image in div_with_images)
        {
            // First() throws when a result has no div.rg_meta; the outer catch records that in Error.
            NSoup.Nodes.Element rg_meta_div = div_with_image.Select("div.rg_meta").First();
            String text_where_the_img_is = rg_meta_div.ToString();

            Match match = imageUrlRegex.Match(text_where_the_img_is);
            if (match.Success && match.Groups[1].Value != String.Empty)
            {
                imagesList.Add(match.Groups[1].Value);
            }
        }
    }
    catch (Exception ex)
    {
        this.Error = ex;
    }

    return imagesList;
}
/// <summary>
/// Click handler: feeds the trimmed text of <c>txtUrl</c> through a <c>ContentHandle</c>
/// (<c>ClearTag</c>, then <c>BodyElement</c>, then <c>GenerateElement</c>) and shows the
/// resulting string in <c>webContent</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnContent_Click(object sender, EventArgs e)
{
    ContentHandle contentHandle = new ContentHandle();
    string address = txtUrl.Text.Trim();

    // NOTE(review): ClearTag receives the URL text and its result is treated as HTML —
    // presumably it fetches the page; confirm against ContentHandle.
    string cleanedHtml = contentHandle.ClearTag(address);
    NSoup.Nodes.Element body = contentHandle.BodyElement(cleanedHtml);

    webContent.DocumentText = contentHandle.GenerateElement(body);
}
/// <summary>
/// Parses the users who visited the home page and saves those whose address contains "广州".
/// </summary>
/// <remarks>
/// Logs in, requests <c>HomePageUrl</c>, and walks the <c>li</c> items under the element with
/// id <c>show_style_01</c> in reverse order. Returns without doing anything when the response
/// is blank, the container element is missing, or it has no <c>li</c> items. Entries that lack
/// the expected child elements or a numeric age are skipped rather than aborting the whole run.
/// </remarks>
public void VisitParser()
{
    this.Login();
    Result result = this.Request(this.HomePageUrl);
    string pageHtml = result.Msg;
    if (string.IsNullOrWhiteSpace(pageHtml))
    {
        return;
    }

    NSoup.Nodes.Document doc = NSoup.NSoupClient.Parse(pageHtml);
    NSoup.Nodes.Element wrapElement = doc.GetElementById("show_style_01");
    if (wrapElement == null)
    {
        // The visitor list container is not in this page; nothing to parse.
        return;
    }

    NSoup.Select.Elements userElementNodes = wrapElement.GetElementsByTag("li");
    if (userElementNodes == null || userElementNodes.Count == 0)
    {
        return;
    }

    // Reverse order, so the newest entry is handled last.
    IEnumerable<NSoup.Nodes.Element> userElements = userElementNodes.Reverse();
    foreach (NSoup.Nodes.Element userElement in userElements)
    {
        NSoup.Nodes.Element picElement = GetElementFirst(userElement.GetElementsByClass("pic"));
        NSoup.Nodes.Element nameElement = GetElementFirst(userElement.GetElementsByClass("user_name"));
        NSoup.Nodes.Element userInfoElement = GetElementFirst(userElement.GetElementsByClass("user_info"));

        // The visit date shown in the entry is never stored, so it is not looked up or parsed.
        if (picElement == null || userInfoElement == null)
        {
            continue;
        }

        string userName = nameElement == null ? "" : nameElement.Child(0).Text();
        string homePage = UriHelper.RemoveParams(nameElement == null ? "" : nameElement.Child(0).Attr("href"));
        string pic = picElement.Child(0).Child(0).Attr("src");

        // The user_info text is split on whitespace: first token is the age ("NN岁"),
        // second token (when present) is the address.
        string[] userInfo = StringHelper.SplitWhiteSpace(userInfoElement.Child(0).Text());
        if (userInfo == null || userInfo.Length == 0)
        {
            continue;
        }

        string addr = userInfo.Length > 1 ? userInfo[1] : string.Empty;

        // The user code is whatever follows the last '/' of the home page URL.
        string userCode = homePage.Substring(homePage.LastIndexOf('/') + 1);
        if (!addr.Contains("广州") || string.IsNullOrWhiteSpace(userCode))
        {
            continue;
        }

        int age;
        if (!int.TryParse(userInfo[0].Replace("岁", ""), out age))
        {
            continue;
        }

        // Read the clock once so CreateTime and ModifyTime agree for a newly created user.
        DateTime now = DateTime.Now;
        FateUserInfo user = FateUserInfoManager.GetUser(userCode);
        if (user == null)
        {
            user = new FateUserInfo() { CreateTime = now };
        }

        user.ModifyTime = now;
        user.UserCode = userCode;
        user.Address = addr;
        user.Age = age;
        user.HeadFileName = pic;
        user.UserName = userName;
        FateUserInfoManager.SaveOrUpdateUser(user);
    }
}
/// <summary>
/// Parses the text link out of an H3 element.
/// </summary>
/// <param name="el">Element whose text is examined; must not be null.</param>
/// <returns>
/// The element text with everything up to and including the first colon removed, or an
/// empty string when the text is blank or contains no colon.
/// </returns>
private string GetContentLink(NSoup.Nodes.Element el)
{
    string text = el.Text();
    if (string.IsNullOrWhiteSpace(text))
    {
        return string.Empty;
    }

    // NOTE(review): both alternatives of this pattern are the same character as written;
    // possibly one was meant to be a full-width colon — confirm against the original source.
    string[] segments = Regex.Split(text, ":|:");
    if (segments.Length <= 1)
    {
        return string.Empty;
    }

    // Drop the first segment and re-join the rest, putting back the colons the split consumed.
    return string.Join(":", segments, 1, segments.Length - 1);
}
/// <summary>
/// Splits an element's inner HTML on <c>br</c> tags and returns the pieces that
/// <c>IsDownloadLink</c> accepts.
/// </summary>
/// <param name="el">Element whose <c>Html()</c> output is split; must not be null.</param>
/// <returns>
/// The accepted pieces in their original order. Empty when the HTML is null/empty or
/// contains no <c>br</c> separator (a single unsplit piece is never tested).
/// </returns>
private List<string> GetSpanContentLink(NSoup.Nodes.Element el)
{
    List<string> found = new List<string>();

    string markup = el.Html();
    if (string.IsNullOrEmpty(markup))
    {
        return found;
    }

    // Case-insensitive split on the br spellings listed in the pattern.
    string[] pieces = Regex.Split(markup, "<br>|<br />|<br/>|<br >", RegexOptions.IgnoreCase);
    if (pieces.Length > 1)
    {
        found.AddRange(pieces.Where(piece => IsDownloadLink(piece)));
    }

    return found;
}
/// <summary>
/// Null-tolerant wrapper around <c>Element.Text()</c>.
/// </summary>
/// <param name="el">Element to read; may be null.</param>
/// <returns>The result of <c>el.Text()</c>, or null when <paramref name="el"/> is null.</returns>
public static string GetText(NSoup.Nodes.Element el)
{
    if (el == null)
    {
        return null;
    }

    return el.Text();
}
/// <summary>
/// Returns the trimmed <c>href</c> attribute of an element.
/// </summary>
/// <param name="el">Element to read; must not be null.</param>
/// <returns>
/// The <c>href</c> value with surrounding whitespace removed, or an empty string when
/// <c>Attr("href")</c> yields null.
/// </returns>
private string GetHref(NSoup.Nodes.Element el)
{
    string href = el.Attr("href");
    if (href == null)
    {
        href = string.Empty;
    }

    return href.Trim();
}