Example #1
0
 /// <summary>
 /// Parses the page with the given parser and appends one WebLink to <paramref name="links"/>
 /// for every captured "Tag" that has an "Html" capture with a "Url" value which
 /// Spider.GetAbsoluteUrl can resolve against <paramref name="url"/>.
 /// </summary>
 /// <param name="links">list the found links are appended to</param>
 /// <param name="link_type">link type stored in every created WebLink</param>
 /// <param name="parser">parser whose result exposes the "Tag" captures</param>
 /// <param name="url">url the captured urls are resolved against</param>
 /// <param name="page">page text to be parsed</param>
 static void find_links(List<WebLink> links, WebLinkType link_type, Cliver.DataSifter.Parser parser, string url, string page)
 {
     //NOTE(review): parsing is done under a static lock; presumably the parsers are shared and not thread-safe - confirm
     lock (static_lock_object)
     {
         Cliver.DataSifter.Capture root = parser.Parse(page);
         foreach (Cliver.DataSifter.Capture tag in root["Tag"])
         {
             Cliver.DataSifter.Capture html = tag.FirstOf("Html");
             if (html != null)
             {
                 string raw_url = html.ValueOf("Url");
                 if (raw_url != null)
                 {
                     string absolute_url = Spider.GetAbsoluteUrl(raw_url, url);
                     //a null result means the url could not be resolved; such tags are skipped
                     if (absolute_url != null)
                     {
                         links.Add(new WebLink(absolute_url, tag.ValueOf("Content"), html.ValueOf("Title"), link_type, tag.Index));
                     }
                 }
             }
         }
     }
 }
Example #2
0
        /// <summary>
        /// Finds web links of the requested kinds in the page.
        /// Image links (WebLinkType.Image) are not extracted by this method.
        /// </summary>
        /// <param name="url">absolute url of parsed page</param>
        /// <param name="page">string to be parsed for web links</param>
        /// <param name="link_type">bit mask of the link kinds to look for</param>
        /// <returns>absolute links, in the order: anchors, areas, meta tags, frames, forms, javascript</returns>
        public static List<WebLink> GetWebLinks(string url, string page, WebLinkType link_type)
        {
            page = Spider.PreparePage(page);
            //from here on url is the base uri that found urls are resolved against
            url = Spider.GetBaseUri(url, page).ToString();

            List<WebLink> links = new List<WebLink>();

            //anchors
            if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
                find_links(links, WebLinkType.Anchor, HtmlAnchors, url, page);

            //areas
            if ((link_type & WebLinkType.Area) == WebLinkType.Area)
                find_links(links, WebLinkType.Area, HtmlAreas, url, page);

            //meta tag url
            if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
            {
                foreach (Match mm in Regex.Matches(page, @"<META [^>]*CONTENT\s*=\s*([""']).+?;\s*URL\s*=\s*(?'Url'.*?)(?:\1|>)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                {
                    //resolve against the base url exactly like find_links does, so that every returned link is absolute
                    string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
                    if (u == null)
                        continue;
                    links.Add(new WebLink(u, null, null, WebLinkType.MetaTag, mm.Index));
                }
            }

            //frames
            if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
                find_links(links, WebLinkType.Frame, HtmlFrames, url, page);

            //forms
            if ((link_type & WebLinkType.Form) == WebLinkType.Form)
                find_links(links, WebLinkType.Form, HtmlForms, url, page);

            //javascript links
            if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
            {
                foreach (Match mm in Regex.Matches(page, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                {
                    //resolve against the base url exactly like find_links does, so that every returned link is absolute
                    string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
                    if (u == null)
                        continue;
                    links.Add(new WebLink(u, null, null, WebLinkType.Javascript, mm.Index));
                }
            }

            return links;
        }
Example #3
0
 /// <summary>
 /// Creates a link description. A non-null url is stored HTML-decoded and trimmed;
 /// a null url leaves Url unassigned.
 /// </summary>
 /// <param name="url">url of the link, may be null</param>
 /// <param name="text">value stored as Text</param>
 /// <param name="title">value stored as Title</param>
 /// <param name="link_type">value stored as WebLinkType</param>
 /// <param name="index">value stored as IndexInPreparedPage</param>
 internal WebLink(string url, string text, string title, WebLinkType link_type, int index)
 {
     if (url != null)
     {
         string decoded_url = HttpUtility.HtmlDecode(url);
         this.Url = decoded_url.Trim();
     }

     this.Text = text;
     this.Title = title;
     this.WebLinkType = link_type;
     this.IndexInPreparedPage = index;
 }
Example #4
0
 /// <summary>
 /// Initializes the link from its parts. The url is HTML-decoded and trimmed before
 /// it is stored; when it is null, Url is not assigned at all.
 /// </summary>
 /// <param name="url">url of the link, may be null</param>
 /// <param name="text">assigned to Text</param>
 /// <param name="title">assigned to Title</param>
 /// <param name="link_type">assigned to WebLinkType</param>
 /// <param name="index">assigned to IndexInPreparedPage</param>
 internal WebLink(string url, string text, string title, WebLinkType link_type, int index)
 {
     bool has_url = url != null;
     if (has_url)
         this.Url = HttpUtility.HtmlDecode(url).Trim();

     this.Text = text;
     this.Title = title;
     this.WebLinkType = link_type;
     this.IndexInPreparedPage = index;
 }
Example #5
0
 /// <summary>
 /// Runs the parser over the page and, for each "Tag" capture whose "Html" capture has a
 /// "Url" value that Spider.GetAbsoluteUrl resolves against <paramref name="url"/>,
 /// appends a WebLink of the given type to <paramref name="links"/>.
 /// </summary>
 /// <param name="links">list the found links are appended to</param>
 /// <param name="link_type">link type stored in every created WebLink</param>
 /// <param name="parser">parser whose result exposes the "Tag" captures</param>
 /// <param name="url">url the captured urls are resolved against</param>
 /// <param name="page">page text to be parsed</param>
 static void find_links(List<WebLink> links, WebLinkType link_type, Cliver.DataSifter.Parser parser, string url, string page)
 {
     //NOTE(review): parsing is done under a static lock; presumably the parsers are shared and not thread-safe - confirm
     lock (static_lock_object)
     {
         Cliver.DataSifter.Capture parsed_page = parser.Parse(page);
         foreach (Cliver.DataSifter.Capture tag_capture in parsed_page["Tag"])
         {
             Cliver.DataSifter.Capture html_capture = tag_capture.FirstOf("Html");
             //each step yields null as soon as the previous one did, so a single check below covers all three cases
             string raw_url = html_capture == null ? null : html_capture.ValueOf("Url");
             string absolute_url = raw_url == null ? null : Spider.GetAbsoluteUrl(raw_url, url);
             if (absolute_url == null)
                 continue;

             string text = tag_capture.ValueOf("Content");
             string title = html_capture.ValueOf("Title");
             links.Add(new WebLink(absolute_url, text, title, link_type, tag_capture.Index));
         }
     }
 }
Example #6
0
        /// <summary>
        /// Finds web links of the requested kinds in the page.
        /// Image links (WebLinkType.Image) are not extracted by this method.
        /// </summary>
        /// <param name="url">absolute url of parsed page</param>
        /// <param name="page">string to be parsed for web links</param>
        /// <param name="link_type">bit mask of the link kinds to look for</param>
        /// <returns>absolute links, in the order: anchors, areas, meta tags, frames, forms, javascript</returns>
        public static List<WebLink> GetWebLinks(string url, string page, WebLinkType link_type)
        {
            page = Spider.PreparePage(page);
            //from here on url is the base uri that found urls are resolved against
            url  = Spider.GetBaseUri(url, page).ToString();

            List<WebLink> links = new List<WebLink>();

            //anchors
            if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
            {
                find_links(links, WebLinkType.Anchor, HtmlAnchors, url, page);
            }

            //areas
            if ((link_type & WebLinkType.Area) == WebLinkType.Area)
            {
                find_links(links, WebLinkType.Area, HtmlAreas, url, page);
            }

            //meta tag url
            if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
            {
                foreach (Match mm in Regex.Matches(page, @"<META [^>]*CONTENT\s*=\s*([""']).+?;\s*URL\s*=\s*(?'Url'.*?)(?:\1|>)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                {
                    //resolve against the base url exactly like find_links does, so that every returned link is absolute
                    string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
                    if (u == null)
                    {
                        continue;
                    }
                    links.Add(new WebLink(u, null, null, WebLinkType.MetaTag, mm.Index));
                }
            }

            //frames
            if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
            {
                find_links(links, WebLinkType.Frame, HtmlFrames, url, page);
            }

            //forms
            if ((link_type & WebLinkType.Form) == WebLinkType.Form)
            {
                find_links(links, WebLinkType.Form, HtmlForms, url, page);
            }

            //javascript links
            if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
            {
                foreach (Match mm in Regex.Matches(page, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                {
                    //resolve against the base url exactly like find_links does, so that every returned link is absolute
                    string u = Spider.GetAbsoluteUrl(mm.Groups["Url"].Value, url);
                    if (u == null)
                    {
                        continue;
                    }
                    links.Add(new WebLink(u, null, null, WebLinkType.Javascript, mm.Index));
                }
            }

            return(links);
        }
Example #7
0
        /// <summary>
        /// Finds web links of the requested kinds in the parsed document.
        /// </summary>
        /// <param name="link_type">bit mask of the link kinds to look for</param>
        /// <returns>links in the order: anchors, areas, meta tags, images, frames, forms, javascript; every url is passed through GetAbsoluteUrl</returns>
        public List<WebLink> GetWebLinks(WebLinkType link_type)
        {
            lock (parent_document)
            {
                List<WebLink> links = new List<WebLink>();

                //anchors
                if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
                    add_attribute_links(links, "//a", WebLinkType.Anchor, "href");

                //areas: the url of an <area> is in href; src is kept as a fallback because it was the only attribute read before
                if ((link_type & WebLinkType.Area) == WebLinkType.Area)
                    add_attribute_links(links, "//area", WebLinkType.Area, "href", "src");

                //meta tag url
                if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
                    add_meta_links(links);

                //images
                if ((link_type & WebLinkType.Image) == WebLinkType.Image)
                    add_attribute_links(links, "//img", WebLinkType.Image, "src");

                //frames
                if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
                    add_attribute_links(links, "//frame", WebLinkType.Frame, "src");

                //forms
                if ((link_type & WebLinkType.Form) == WebLinkType.Form)
                    add_attribute_links(links, "//form", WebLinkType.Form, "action");

                //javascript links
                if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
                {
                    HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//script");
                    if (hnc != null)
                    {
                        foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                        {
                            //the match index is relative to the script text, so it is offset by the position of the script node
                            foreach (Match mm in Regex.Matches(hn.InnerText, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                                links.Add(new WebLink(GetAbsoluteUrl(mm.Groups["Url"].Value), null, null, WebLinkType.Javascript, hn.StreamPosition + mm.Index));
                        }
                    }
                }

                return links;
            }
        }

        /// <summary>
        /// Adds a link for every node selected by xpath that carries one of the url attributes.
        /// The first attribute of url_attributes that is present on the node supplies the url;
        /// the node's inner text and its title attribute (when present) supply text and title.
        /// Must be called while parent_document is locked.
        /// </summary>
        private void add_attribute_links(List<WebLink> links, string xpath, WebLinkType link_type, params string[] url_attributes)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes(xpath);
            if (hnc == null)
                return;

            foreach (HtmlAgilityPack.HtmlNode hn in hnc)
            {
                HtmlAgilityPack.HtmlAttribute url_attribute = null;
                foreach (string name in url_attributes)
                {
                    url_attribute = hn.Attributes[name];
                    if (url_attribute != null)
                        break;
                }
                if (url_attribute == null)
                    continue;

                HtmlAgilityPack.HtmlAttribute title_attribute = hn.Attributes["title"];
                links.Add(new WebLink(GetAbsoluteUrl(url_attribute.Value), hn.InnerText, title_attribute != null ? title_attribute.Value : null, link_type, hn.StreamPosition));
            }
        }

        /// <summary>
        /// Adds a link for every meta node that names a url: either in the "; url=..." part of its
        /// content attribute (the meta refresh form) or in a url attribute, which is still honoured
        /// because it was the only attribute read before.
        /// Must be called while parent_document is locked.
        /// </summary>
        private void add_meta_links(List<WebLink> links)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//meta");
            if (hnc == null)
                return;

            foreach (HtmlAgilityPack.HtmlNode hn in hnc)
            {
                string u = null;
                HtmlAgilityPack.HtmlAttribute url_attribute = hn.Attributes["url"];
                if (url_attribute != null)
                    u = url_attribute.Value;
                else
                {
                    HtmlAgilityPack.HtmlAttribute content_attribute = hn.Attributes["content"];
                    if (content_attribute != null)
                    {
                        //the url may optionally be wrapped in quotes inside the content value
                        Match m = Regex.Match(content_attribute.Value, @";\s*URL\s*=\s*(['""]?)(?'Url'.+?)\1\s*$", RegexOptions.IgnoreCase);
                        if (m.Success)
                            u = m.Groups["Url"].Value;
                    }
                }
                if (u == null)
                    continue;

                links.Add(new WebLink(GetAbsoluteUrl(u), hn.InnerText, null, WebLinkType.MetaTag, hn.StreamPosition));
            }
        }
Example #8
0
        /// <summary>
        /// Finds web links of the requested kinds in the parsed document.
        /// </summary>
        /// <param name="link_type">bit mask of the link kinds to look for</param>
        /// <returns>links in the order: anchors, areas, meta tags, images, frames, forms, javascript; every url is passed through GetAbsoluteUrl</returns>
        public List<WebLink> GetWebLinks(WebLinkType link_type)
        {
            lock (parent_document)
            {
                List<WebLink> links = new List<WebLink>();

                //anchors
                if ((link_type & WebLinkType.Anchor) == WebLinkType.Anchor)
                {
                    add_attribute_links(links, "//a", WebLinkType.Anchor, "href");
                }

                //areas: the url of an <area> is in href; src is kept as a fallback because it was the only attribute read before
                if ((link_type & WebLinkType.Area) == WebLinkType.Area)
                {
                    add_attribute_links(links, "//area", WebLinkType.Area, "href", "src");
                }

                //meta tag url
                if ((link_type & WebLinkType.MetaTag) == WebLinkType.MetaTag)
                {
                    add_meta_links(links);
                }

                //images
                if ((link_type & WebLinkType.Image) == WebLinkType.Image)
                {
                    add_attribute_links(links, "//img", WebLinkType.Image, "src");
                }

                //frames
                if ((link_type & WebLinkType.Frame) == WebLinkType.Frame)
                {
                    add_attribute_links(links, "//frame", WebLinkType.Frame, "src");
                }

                //forms
                if ((link_type & WebLinkType.Form) == WebLinkType.Form)
                {
                    add_attribute_links(links, "//form", WebLinkType.Form, "action");
                }

                //javascript links
                if ((link_type & WebLinkType.Javascript) == WebLinkType.Javascript)
                {
                    HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//script");
                    if (hnc != null)
                    {
                        foreach (HtmlAgilityPack.HtmlNode hn in hnc)
                        {
                            //the match index is relative to the script text, so it is offset by the position of the script node
                            foreach (Match mm in Regex.Matches(hn.InnerText, @"(?:location.href|window.open)\((['""])(?'Url'[^\>\;]+?)\1\)", RegexOptions.Compiled | RegexOptions.Multiline | RegexOptions.IgnoreCase))
                            {
                                links.Add(new WebLink(GetAbsoluteUrl(mm.Groups["Url"].Value), null, null, WebLinkType.Javascript, hn.StreamPosition + mm.Index));
                            }
                        }
                    }
                }

                return(links);
            }
        }

        /// <summary>
        /// Adds a link for every node selected by xpath that carries one of the url attributes.
        /// The first attribute of url_attributes that is present on the node supplies the url;
        /// the node's inner text and its title attribute (when present) supply text and title.
        /// Must be called while parent_document is locked.
        /// </summary>
        private void add_attribute_links(List<WebLink> links, string xpath, WebLinkType link_type, params string[] url_attributes)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes(xpath);
            if (hnc == null)
            {
                return;
            }

            foreach (HtmlAgilityPack.HtmlNode hn in hnc)
            {
                HtmlAgilityPack.HtmlAttribute url_attribute = null;
                foreach (string name in url_attributes)
                {
                    url_attribute = hn.Attributes[name];
                    if (url_attribute != null)
                    {
                        break;
                    }
                }
                if (url_attribute == null)
                {
                    continue;
                }

                HtmlAgilityPack.HtmlAttribute title_attribute = hn.Attributes["title"];
                links.Add(new WebLink(GetAbsoluteUrl(url_attribute.Value), hn.InnerText, title_attribute != null ? title_attribute.Value : null, link_type, hn.StreamPosition));
            }
        }

        /// <summary>
        /// Adds a link for every meta node that names a url: either in the "; url=..." part of its
        /// content attribute (the meta refresh form) or in a url attribute, which is still honoured
        /// because it was the only attribute read before.
        /// Must be called while parent_document is locked.
        /// </summary>
        private void add_meta_links(List<WebLink> links)
        {
            HtmlAgilityPack.HtmlNodeCollection hnc = parent_document.DocumentNode.SelectNodes("//meta");
            if (hnc == null)
            {
                return;
            }

            foreach (HtmlAgilityPack.HtmlNode hn in hnc)
            {
                string u = null;
                HtmlAgilityPack.HtmlAttribute url_attribute = hn.Attributes["url"];
                if (url_attribute != null)
                {
                    u = url_attribute.Value;
                }
                else
                {
                    HtmlAgilityPack.HtmlAttribute content_attribute = hn.Attributes["content"];
                    if (content_attribute != null)
                    {
                        //the url may optionally be wrapped in quotes inside the content value
                        Match m = Regex.Match(content_attribute.Value, @";\s*URL\s*=\s*(['""]?)(?'Url'.+?)\1\s*$", RegexOptions.IgnoreCase);
                        if (m.Success)
                        {
                            u = m.Groups["Url"].Value;
                        }
                    }
                }
                if (u == null)
                {
                    continue;
                }

                links.Add(new WebLink(GetAbsoluteUrl(u), hn.InnerText, null, WebLinkType.MetaTag, hn.StreamPosition));
            }
        }