/// <summary>
/// Runs the given parser over the page and appends a WebLink for every "Tag" capture
/// whose "Html" capture holds a "Url" that Spider.GetAbsoluteUrl can resolve.
/// The whole parse runs while holding static_lock_object.
/// </summary>
/// <param name="links">list the found links are appended to</param>
/// <param name="link_type">link kind assigned to every created WebLink</param>
/// <param name="parser">parser applied to the page</param>
/// <param name="url">url the found urls are resolved against</param>
/// <param name="page">page text to be parsed</param>
static void find_links(List<WebLink> links, WebLinkType link_type, Cliver.DataSifter.Parser parser, string url, string page)
{
    lock (static_lock_object)
    {
        Cliver.DataSifter.Capture root = parser.Parse(page);
        foreach (Cliver.DataSifter.Capture tag_capture in root["Tag"])
        {
            Cliver.DataSifter.Capture html_capture = tag_capture.FirstOf("Html");

            // Each step yields null as soon as the previous one produced nothing.
            string raw_url = html_capture == null ? null : html_capture.ValueOf("Url");
            string absolute_url = raw_url == null ? null : Spider.GetAbsoluteUrl(raw_url, url);
            if (absolute_url == null)
            {
                continue;
            }

            links.Add(new WebLink(absolute_url, tag_capture.ValueOf("Content"), html_capture.ValueOf("Title"), link_type, tag_capture.Index));
        }
    }
}
/// <summary>
/// Finds web links in the page
/// </summary>
/// <remarks>
/// Links are collected in this order: anchors, areas, meta tags, frames, forms, javascript.
/// WebLinkType.Image is not handled by this method.
/// </remarks>
/// <param name="url">absolute url of parsed page</param>
/// <param name="page">string to be parsed for web links</param>
/// <param name="link_type">bit mask of the link kinds to look for</param>
/// <returns>absolute links</returns>
public static List<WebLink> GetWebLinks(string url, string page, WebLinkType link_type)
{
    page = Spider.PreparePage(page);
    url = Spider.GetBaseUri(url, page).ToString();

    List<WebLink> links = new List<WebLink>();

    //anchors
    if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
        find_links(links, WebLinkType.Anchor, HtmlAnchors, url, page);

    //areas
    if ((link_type & WebLinkType.Area) == WebLinkType.Area)
        find_links(links, WebLinkType.Area, HtmlAreas, url, page);

    //meta tag url
    if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
    {
        foreach (Match mm in Regex.Matches(page, @"<META [^>]*CONTENT\s*=\s*([""']).+?;\s*URL\s*=\s*(?'Url'.*?)(?:\1|>)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
        {
            // Resolve against the base url exactly like find_links does, so that
            // every returned link is absolute; unresolvable urls are skipped.
            string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
            if (u == null)
                continue;
            links.Add(new WebLink(u, null, null, WebLinkType.MetaTag, mm.Index));
        }
    }

    //frames
    if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
        find_links(links, WebLinkType.Frame, HtmlFrames, url, page);

    //forms
    if ((link_type & WebLinkType.Form) == WebLinkType.Form)
        find_links(links, WebLinkType.Form, HtmlForms, url, page);

    //javascript links
    if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
    {
        foreach (Match mm in Regex.Matches(page, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
        {
            // Same resolution rule as for the other link kinds.
            string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
            if (u == null)
                continue;
            links.Add(new WebLink(u, null, null, WebLinkType.Javascript, mm.Index));
        }
    }

    return links;
}
/// <summary>
/// Creates a link description.
/// </summary>
/// <param name="url">link url; when not null it is HTML-decoded and trimmed before being stored</param>
/// <param name="text">stored as Text</param>
/// <param name="title">stored as Title</param>
/// <param name="link_type">stored as WebLinkType</param>
/// <param name="index">stored as IndexInPreparedPage</param>
internal WebLink(string url, string text, string title, WebLinkType link_type, int index)
{
    if (url != null)
    {
        string decoded = HttpUtility.HtmlDecode(url);
        Url = decoded.Trim();
    }

    Text = text;
    Title = title;
    WebLinkType = link_type;
    IndexInPreparedPage = index;
}
/// <summary>
/// Initializes the link from the values found by a link search.
/// </summary>
/// <param name="url">url of the link; a null url leaves Url unassigned, otherwise the HTML-decoded, trimmed value is kept</param>
/// <param name="text">value for Text</param>
/// <param name="title">value for Title</param>
/// <param name="link_type">value for WebLinkType</param>
/// <param name="index">value for IndexInPreparedPage</param>
internal WebLink(string url, string text, string title, WebLinkType link_type, int index)
{
    bool has_url = url != null;
    if (has_url)
        this.Url = HttpUtility.HtmlDecode(url).Trim();

    this.Text = text;
    this.Title = title;
    this.WebLinkType = link_type;
    this.IndexInPreparedPage = index;
}
/// <summary>
/// Parses the page with the given parser while holding static_lock_object and adds
/// one WebLink per "Tag" capture that has an "Html" capture with a resolvable "Url".
/// </summary>
/// <param name="links">receives the created links</param>
/// <param name="link_type">link kind given to each created WebLink</param>
/// <param name="parser">parser used to capture the tags</param>
/// <param name="url">url passed to Spider.GetAbsoluteUrl as the base for each found url</param>
/// <param name="page">text that is parsed</param>
static void find_links(List<WebLink> links, WebLinkType link_type, Cliver.DataSifter.Parser parser, string url, string page)
{
    lock (static_lock_object)
    {
        Cliver.DataSifter.Capture parsed = parser.Parse(page);
        foreach (Cliver.DataSifter.Capture capture in parsed["Tag"])
        {
            Cliver.DataSifter.Capture html_part = capture.FirstOf("Html");
            if (html_part != null)
            {
                string found = html_part.ValueOf("Url");
                if (found != null)
                {
                    // A null result from the resolver means the link is dropped.
                    string resolved = Spider.GetAbsoluteUrl(found, url);
                    if (resolved != null)
                    {
                        links.Add(new WebLink(resolved, capture.ValueOf("Content"), html_part.ValueOf("Title"), link_type, capture.Index));
                    }
                }
            }
        }
    }
}
/// <summary>
/// Finds web links in the page
/// </summary>
/// <remarks>
/// Anchors, areas, frames and forms are found by find_links with the matching parser;
/// meta tag and javascript links are found by regular expressions. Image links are
/// not collected by this method.
/// </remarks>
/// <param name="url">absolute url of parsed page</param>
/// <param name="page">string to be parsed for web links</param>
/// <param name="link_type">bit mask of the link kinds to be collected</param>
/// <returns>absolute links</returns>
public static List<WebLink> GetWebLinks(string url, string page, WebLinkType link_type)
{
    page = Spider.PreparePage(page);
    url = Spider.GetBaseUri(url, page).ToString();

    List<WebLink> links = new List<WebLink>();
    RegexOptions options = RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase;

    //anchors
    if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
    {
        find_links(links, WebLinkType.Anchor, HtmlAnchors, url, page);
    }

    //areas
    if ((link_type & WebLinkType.Area) == WebLinkType.Area)
    {
        find_links(links, WebLinkType.Area, HtmlAreas, url, page);
    }

    //meta tag url
    if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
    {
        foreach (Match mm in Regex.Matches(page, @"<META [^>]*CONTENT\s*=\s*([""']).+?;\s*URL\s*=\s*(?'Url'.*?)(?:\1|>)", options))
        {
            // The captured url may be relative: resolve it like find_links does
            // and drop it when it cannot be resolved.
            string absolute = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
            if (absolute != null)
            {
                links.Add(new WebLink(absolute, null, null, WebLinkType.MetaTag, mm.Index));
            }
        }
    }

    //frames
    if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
    {
        find_links(links, WebLinkType.Frame, HtmlFrames, url, page);
    }

    //forms
    if ((link_type & WebLinkType.Form) == WebLinkType.Form)
    {
        find_links(links, WebLinkType.Form, HtmlForms, url, page);
    }

    //javascript links
    if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
    {
        foreach (Match mm in Regex.Matches(page, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", options))
        {
            // Resolved for the same reason as the meta tag urls above.
            string absolute = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
            if (absolute != null)
            {
                links.Add(new WebLink(absolute, null, null, WebLinkType.Javascript, mm.Index));
            }
        }
    }

    return links;
}
/// <summary>
/// Finds web links in the parsed document
/// </summary>
/// <remarks>
/// The whole search runs while holding a lock on parent_document. Links are collected
/// in this order: anchors, areas, meta tags, images, frames, forms, javascript.
/// Every found url is passed through GetAbsoluteUrl before it is stored.
/// </remarks>
/// <param name="link_type">bit mask of the link kinds to look for</param>
/// <returns>absolute links</returns>
public List<WebLink> GetWebLinks(WebLinkType link_type)
{
    lock (parent_document)
    {
        List<WebLink> links = new List<WebLink>();

        //anchors
        if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//a");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                    if (hn.Attributes["href"] != null)
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["href"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Anchor, hn.StreamPosition));
            }
        }

        //areas
        if ((link_type & WebLinkType.Area) == WebLinkType.Area)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//area");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                {
                    // An <area> carries its target in "href"; "src" is still accepted
                    // as a fallback so links found before this fix keep being found.
                    string target = hn.Attributes["href"] != null ? "href" : "src";
                    if (hn.Attributes[target] != null)
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes[target].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Area, hn.StreamPosition));
                }
            }
        }

        //meta tag url
        if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//meta");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                {
                    string u = null;
                    if (hn.Attributes["url"] != null)
                    {
                        // Non-standard attribute, honoured as before.
                        u = hn.Attributes["url"].Value;
                    }
                    else if (hn.Attributes["content"] != null)
                    {
                        // Redirect form: <meta http-equiv="refresh" content="5; url=target">
                        Match m = Regex.Match(hn.Attributes["content"].Value, @";\s*URL\s*=\s*(?'Url'.+)$", RegexOptions.IgnoreCase | RegexOptions.Singleline);
                        if (m.Success)
                        {
                            u = m.Groups["Url"].Value.Trim().Trim('\'', '"');
                            if (u.Length < 1)
                                u = null;
                        }
                    }
                    if (u != null)
                        links.Add(new WebLink(GetAbsoluteUrl(u), hn.InnerText, null, WebLinkType.MetaTag, hn.StreamPosition));
                }
            }
        }

        //images
        if ((link_type & WebLinkType.Image) == WebLinkType.Image)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//img");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                    if (hn.Attributes["src"] != null)
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["src"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Image, hn.StreamPosition));
            }
        }

        //frames
        if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//frame");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                    if (hn.Attributes["src"] != null)
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["src"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Frame, hn.StreamPosition));
            }
        }

        //forms
        if ((link_type & WebLinkType.Form) == WebLinkType.Form)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//form");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                    if (hn.Attributes["action"] != null)
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["action"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Form, hn.StreamPosition));
            }
        }

        //javascript links
        if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//script");
            if (hnc != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                {
                    // The match index is relative to the script text, so it is offset
                    // by the node's position.
                    foreach (Match mm in Regex.Matches(hn.InnerText, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                        links.Add(new WebLink(GetAbsoluteUrl(mm.Groups["Url"].Value), null, null, WebLinkType.Javascript, hn.StreamPosition + mm.Index));
                }
            }
        }

        return links;
    }
}
/// <summary>
/// Finds web links in the parsed document
/// </summary>
/// <remarks>
/// Runs under a lock on parent_document. For each requested link kind the matching
/// elements are selected by XPath and their url attribute is passed through
/// GetAbsoluteUrl. Result order: anchors, areas, meta tags, images, frames, forms,
/// javascript.
/// </remarks>
/// <param name="link_type">bit mask of the link kinds to be collected</param>
/// <returns>absolute links</returns>
public List<WebLink> GetWebLinks(WebLinkType link_type)
{
    lock (parent_document)
    {
        List<WebLink> links = new List<WebLink>();

        //anchors
        if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
        {
            HtmlAgilityPack.HtmlNodeCollection anchors = parent_document.DocumentNode.SelectNodes("//a");
            if (anchors != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in anchors)
                {
                    if (hn.Attributes["href"] != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["href"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Anchor, hn.StreamPosition));
                    }
                }
            }
        }

        //areas
        if ((link_type & WebLinkType.Area) == WebLinkType.Area)
        {
            HtmlAgilityPack.HtmlNodeCollection areas = parent_document.DocumentNode.SelectNodes("//area");
            if (areas != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in areas)
                {
                    // The target of an <area> is its "href" attribute. "src" is only
                    // consulted when "href" is absent, to keep the earlier behavior.
                    string area_attribute = hn.Attributes["href"] != null ? "href" : "src";
                    if (hn.Attributes[area_attribute] != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes[area_attribute].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Area, hn.StreamPosition));
                    }
                }
            }
        }

        //meta tag url
        if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
        {
            HtmlAgilityPack.HtmlNodeCollection metas = parent_document.DocumentNode.SelectNodes("//meta");
            if (metas != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in metas)
                {
                    string meta_url = null;
                    if (hn.Attributes["url"] != null)
                    {
                        // Non-standard "url" attribute: still accepted as before.
                        meta_url = hn.Attributes["url"].Value;
                    }
                    else if (hn.Attributes["content"] != null)
                    {
                        // The url of a redirecting meta tag is inside its content
                        // attribute, e.g. content="5; url=target".
                        Match m = Regex.Match(hn.Attributes["content"].Value, @";\s*URL\s*=\s*(?'Url'.+)$", RegexOptions.IgnoreCase | RegexOptions.Singleline);
                        if (m.Success)
                        {
                            string candidate = m.Groups["Url"].Value.Trim().Trim('\'', '"');
                            if (candidate.Length > 0)
                            {
                                meta_url = candidate;
                            }
                        }
                    }

                    if (meta_url != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(meta_url), hn.InnerText, null, WebLinkType.MetaTag, hn.StreamPosition));
                    }
                }
            }
        }

        //images
        if ((link_type & WebLinkType.Image) == WebLinkType.Image)
        {
            HtmlAgilityPack.HtmlNodeCollection images = parent_document.DocumentNode.SelectNodes("//img");
            if (images != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in images)
                {
                    if (hn.Attributes["src"] != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["src"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Image, hn.StreamPosition));
                    }
                }
            }
        }

        //frames
        if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
        {
            HtmlAgilityPack.HtmlNodeCollection frames = parent_document.DocumentNode.SelectNodes("//frame");
            if (frames != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in frames)
                {
                    if (hn.Attributes["src"] != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["src"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Frame, hn.StreamPosition));
                    }
                }
            }
        }

        //forms
        if ((link_type & WebLinkType.Form) == WebLinkType.Form)
        {
            HtmlAgilityPack.HtmlNodeCollection forms = parent_document.DocumentNode.SelectNodes("//form");
            if (forms != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in forms)
                {
                    if (hn.Attributes["action"] != null)
                    {
                        links.Add(new WebLink(GetAbsoluteUrl(hn.Attributes["action"].Value), hn.InnerText, hn.Attributes["title"] != null ? hn.Attributes["title"].Value : null, WebLinkType.Form, hn.StreamPosition));
                    }
                }
            }
        }

        //javascript links
        if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
        {
            HtmlAgilityPack.HtmlNodeCollection scripts = parent_document.DocumentNode.SelectNodes("//script");
            if (scripts != null)
            {
                foreach (HtmlAgilityPack.HtmlNode hn in scripts)
                {
                    foreach (Match mm in Regex.Matches(hn.InnerText, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                    {
                        // The match index is relative to the script text; it is added
                        // to the node's position.
                        links.Add(new WebLink(GetAbsoluteUrl(mm.Groups["Url"].Value), null, null, WebLinkType.Javascript, hn.StreamPosition + mm.Index));
                    }
                }
            }
        }

        return links;
    }
}