Example #1
File: Crawler.cs Project: noyeem/Wipro2
        private async Task <IEnumerable <Link> > GetRootPageLinksAsync()
        {
            IEnumerable <Link> links = null;
            var html = await GetRootPageHtmlAsync();

            links = CrawlerParser.GetPageLinks(this.RootUri, html);
            if (links != null)
            {
                links = FilterLinks(links);
            }
            else
            {
                return(null);
            }

            foreach (var link in links)
            {
                try
                {
                    if (link != null && !string.IsNullOrEmpty(link.Href))
                    {
                        SiteNode.AddChild(link);
                    }
                }
                catch (Exception)
                {
                    // Ignore failures for this link and continue with the remaining links
                }
            }
            return(links);
        }
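FilterLinks is called here but its body is not included in these examples. A minimal sketch, assuming it mirrors the inline filter used in ExtractAllAsync (Example #6) and that System.Linq is imported; the project's actual implementation may differ:

        // Hypothetical sketch of FilterLinks; the real project method is not shown in these examples.
        private IEnumerable<Link> FilterLinks(IEnumerable<Link> links)
        {
            return links
                   .Where(l => l != null && !string.IsNullOrEmpty(l.Href))  // drop empty hrefs
                   .Where(l => !l.IsJavaScript && !l.IsStatic && l.IsWipro) // keep crawlable wipro pages only
                   .GroupBy(l => l.Href)                                    // remove duplicate hrefs
                   .Select(g => g.First())
                   .ToList();
        }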
Example #2
        private static readonly string[] staticFileExtensions = { ".css", ".ico", ".pdf", ".json", ".xml", ".jpg", ".jpeg", ".bmp", ".gif", ".png", ".js" }; //  etc

        /// <summary>
        /// Get a list of Link objects from html
        /// </summary>
        /// <param name="rootUrl">Root url of the crawled site</param>
        /// <param name="html">Page Html</param>
        /// <returns>List of Link objects</returns>
        public static IEnumerable <Link> GetPageLinks(string rootUrl, string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return(null);
            }

            var    list     = new List <Link>();
            string strRegex = @"href\s*=\s*(?:\""(?<1>[^\""]*)\""|(?<1>\S+))"; //regex: matches href attribute values (quoted or unquoted)
            var    regex    = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

            //On match found parse the link and store it as Link
            if (regex != null && regex.IsMatch(html))
            {
                foreach (Match match in regex.Matches(html))
                {
                    var link = CrawlerParser.ParseLink(rootUrl, match.Value);

                    //Only add if Link(:href) does not exist in the list
                    if (!list.Exists(x => x.Href.Equals(link.Href)))
                    {
                        list.Add(link);
                    }
                }
            }

            return(list);
        }
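IsStatic is used by ParseLink (Example #3) but its body is not listed. A plausible sketch built on the staticFileExtensions array above, assuming System.Linq is available; this is an assumption, not the project's confirmed implementation:

        // Hypothetical sketch of IsStatic, based on the staticFileExtensions array shown above.
        public static bool IsStatic(string href)
        {
            if (string.IsNullOrEmpty(href))
            {
                return false;
            }

            // Strip any query string or fragment before checking the file extension
            var path = href.Split('?', '#')[0];
            return staticFileExtensions.Any(ext => path.EndsWith(ext, StringComparison.OrdinalIgnoreCase));
        }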
Example #3
        /// <summary>
        /// Parse a Link object from a raw href match returned by CrawlerParser.GetPageLinks()
        /// </summary>
        /// <param name="rootUri">Root uri of the crawled site</param>
        /// <param name="raw">Raw regex match text containing the href</param>
        /// <returns>Parsed Link object</returns>
        public static Link ParseLink(string rootUri, string raw)
        {
            //Parse the URI from the regex match value and perform some cleanups
            var href = CrawlerParser.ParseTextFromQuote(raw);

            // true if the link points to the wipro site
            bool isWipro = CrawlerParser.IsWipro(href);

            // true if the url is relative
            bool isRelative = CrawlerParser.IsRelativeUrl(href);

            // true for javascript: or in-page hash (#) links
            bool isJs = CrawlerParser.IsJavaScriptOrHashLink(raw, href);

            // true if the href points to a static file (css, js, images, etc.)
            bool isStatic = CrawlerParser.IsStatic(href);

            //get filename from url
            string extension = string.Empty;
            string fileName  = null;

            if (isWipro && !isJs && !isStatic)
            {
                fileName = Utilities.GetFileNameFromUrl(rootUri, href, out extension);
            }

            //get file-type from extension
            var type = Utilities.GetHrefType(extension);

            // group[1] value contains Title
            //var title = match.Groups.Count > 0 ? match.Groups[1].Value : "";

            //generate Link object
            var link = new Link
            {
                Raw           = raw,                // store raw search
                Href          = href,               // hyperlink
                IsRelativeUrl = isRelative,
                IsWipro       = isWipro,
                IsJavaScript  = isJs,
                IsStatic      = isStatic,
                Type          = type,
                FileName      = fileName
            };

            return(link);
        }
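The Link model itself is not part of these examples. Its approximate shape can be inferred from the properties used above; the property types below are assumptions (in particular Type, whose value comes from Utilities.GetHrefType, which is not shown):

        // Approximate shape of the Link model, inferred from the properties used in these examples.
        public class Link
        {
            public string Raw { get; set; }               // raw matched text from the page html
            public string Href { get; set; }              // hyperlink extracted from the match
            public bool IsRelativeUrl { get; set; }
            public bool IsWipro { get; set; }
            public bool IsJavaScript { get; set; }
            public bool IsStatic { get; set; }
            public bool IsImage { get; set; }
            public string Type { get; set; }              // assumed type; set from Utilities.GetHrefType(extension)
            public string FileName { get; set; }
            public IEnumerable<Link> Links { get; set; }  // child links found on the linked page
        }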
Example #4
File: Crawler.cs Project: noyeem/Wipro2
        public IEnumerable <Link> GetPageImages(string html)
        {
            var list     = new List <Link>();
            var strRegex = @"<img.+?src=[""'](.+?)[""'].*?>"; //regex: searches image tags
            var regex    = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);

            //On match found parse the link and store it as Link
            if (regex.IsMatch(html))
            {
                foreach (Match match in regex.Matches(html))
                {
                    list.Add(new Link
                    {
                        Raw           = match.Value,                                        // raw matched <img> tag
                        Href          = match.Groups[1].Value,                              // image source url
                        IsRelativeUrl = CrawlerParser.IsRelativeUrl(match.Groups[1].Value), // true if the src url is relative
                        IsImage       = true                                                // mark this link as an image
                    });
                }
            }

            return(list);
        }
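A short usage sketch for GetPageImages; the Crawler constructor is not shown in these examples, so the crawler instance below is assumed to exist already:

            // Usage sketch: 'crawler' is an already-constructed Crawler instance (constructor not shown here).
            var html = "<html><body><img src=\"/images/logo.png\" alt=\"logo\" /></body></html>";
            foreach (var image in crawler.GetPageImages(html))
            {
                Console.WriteLine("{0} (relative url: {1})", image.Href, image.IsRelativeUrl);
            }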
Example #5
File: Crawler.cs Project: noyeem/Wipro2
        public IEnumerable <Link> ExtractAll(string url, bool isRoot)
        {
            //links found on this page; duplicates are skipped when adding them to the tree
            IEnumerable <Link> links = null;

            if (isRoot)
            {
                var html = GetRootPageHtml();
                if (string.IsNullOrEmpty(html))
                {
                    return(links);
                }

                links = FilterLinks(CrawlerParser.GetPageLinks(url, html));

                foreach (var link in links)
                {
                    if (!this.Links.Contains(link))
                    {
                        this.Links.Add(link);
                    }
                    SiteNode.AddChild(link);
                    ExtractAll(link.Href, false);
                }
            }
            else
            {
                url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);

                var html = GetPageHtml(url);
                if (string.IsNullOrEmpty(html))
                {
                    return(links);
                }

                links = CrawlerParser.GetPageLinks(this.RootUri, html);
                if (links != null)
                {
                    links = FilterLinks(links);
                }
                else
                {
                    return(links);
                }

                foreach (var link in links)
                {
                    var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

                    if (node == null)
                    {
                        this.SiteNode.AddChild(link);
                        if (depthCounter2 < MAX_DEPTH_COUNT)
                        {
                            depthCounter2++;
                            this.Links.AddRange(ExtractAll(link.Href, false));
                        }
                    }
                    else
                    {
                        if (depthCounter2 < MAX_DEPTH_COUNT)
                        {
                            depthCounter2++;
                            node.AddChild(link);
                            link.Links = ExtractAll(link.Href, false);
                        }
                    }
                }
            }
            return(links);
        }
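GetAbsoluteUri and IsRelativeUrl are used throughout these examples but not listed. Minimal sketches based on System.Uri, under the assumption that the project resolves relative links against the root uri; the actual implementations may differ:

        // Hypothetical sketches of the two url helpers used above.
        public static bool IsRelativeUrl(string href)
        {
            // Anything that is not a well-formed absolute uri is treated as relative
            return !Uri.IsWellFormedUriString(href, UriKind.Absolute);
        }

        public static string GetAbsoluteUri(string rootUri, string url)
        {
            // Relative urls are resolved against the root; absolute urls pass through unchanged
            return new Uri(new Uri(rootUri), url).AbsoluteUri;
        }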
Example #6
File: Crawler.cs Project: noyeem/Wipro2
        public async Task ExtractAllAsync(string url, bool isRoot)
        {
            //links found on this page; duplicates are skipped when adding them to the tree
            IEnumerable <Link> links = null;

            if (isRoot)
            {
                var html = await GetRootPageHtmlAsync();

                links = CrawlerParser.GetPageLinks(this.RootUri, html);
                if (links != null)
                {
                    links = links.Where(x => !x.IsJavaScript && !x.IsStatic && x.IsWipro);
                }
                else
                {
                    return;
                }

                foreach (var link in links)
                {
                    try
                    {
                        if (link != null && !string.IsNullOrEmpty(link.Href))
                        {
                            SiteNode.AddChild(link);
                            if (!blackListUrls.Contains(link.Href))
                            {
                                await ExtractAllAsync(link.Href, false);
                            }
                        }
                    }
                    catch (Exception)
                    {
                        // Ignore failures for this link and continue with the remaining links
                    }
                }
            }
            else
            {
                url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);
                string html = string.Empty;

                try
                {
                    html = await GetPageAsync(url);
                }
                catch (Exception)
                {
                    // Swallow download errors; html stays empty, GetPageLinks returns null and we return below
                }

                links = CrawlerParser.GetPageLinks(this.RootUri, html);
                if (links != null)
                {
                    links = links.Where(x => !x.IsJavaScript && !x.IsStatic && x.IsWipro);
                }
                else
                {
                    return;
                }

                foreach (var link in links)
                {
                    try
                    {
                        var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

                        if (node == null)
                        {
                            if (!this.SiteNode.Children.Any(x => x.Data.Href.Equals(link.Href)))
                            {
                                this.SiteNode.AddChild(link);
                            }
                            if (!blackListUrls.Contains(link.Href) && link.Href.ToLower() != this.RootUri.ToLower() && depthCounter2 < MAX_DEPTH_COUNT)
                            {
                                depthCounter2++;
                                await ExtractAllAsync(link.Href, false);

                                Console.WriteLine("Href: {0}\nChildren Count: {1}\nElements Index: {2}", link.Href, SiteNode.Children.Count, SiteNode.ElementsIndex.Count);
                            }
                        }
                        else
                        {
                            if (!blackListUrls.Contains(link.Href) && link.Href.ToLower() != this.RootUri.ToLower() && depthCounter2 < MAX_DEPTH_COUNT)
                            {
                                depthCounter2++;
                                if (!node.IsRoot && !node.Children.Any(x => x.Data.Href.Equals(link.Href)))
                                {
                                    node.AddChild(link);
                                }
                                await ExtractAllAsync(link.Href, false);
                            }
                        }
                    }
                    catch (Exception)
                    {
                        // Ignore failures for this link and continue with the remaining links
                    }
                }
            }
        }
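A sketch of how a crawl might be started with ExtractAllAsync. The Crawler constructor signature and the public accessibility of RootUri and SiteNode are assumptions; they are not shown in these examples:

            // Usage sketch (inside an async method): start a crawl from the root page.
            // The Crawler constructor and access to RootUri / SiteNode are assumed, not confirmed.
            var crawler = new Crawler("https://www.wipro.com");
            await crawler.ExtractAllAsync(crawler.RootUri, isRoot: true);
            Console.WriteLine("Root children collected: {0}", crawler.SiteNode.Children.Count);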
Example #7
File: Crawler.cs Project: noyeem/Wipro2
        private async Task <IEnumerable <Link> > GetChildPagesLinksAsync(string url)
        {
            IEnumerable <Link> links = null;

            url = CrawlerParser.GetAbsoluteUri(this.RootUri, url);
            string html = string.Empty;

            try
            {
                html = await GetPageAsync(url);
            }
            catch (Exception)
            {
                // Swallow download errors; html stays empty, GetPageLinks returns null and we return below
            }

            links = CrawlerParser.GetPageLinks(this.RootUri, html);
            if (links != null)
            {
                links = FilterLinks(links);
            }
            else
            {
                return(null);
            }

            // Parallel.ForEach does not await async lambdas, so child crawls could still be running
            // when this method returns; Task.WhenAll is awaited so every child crawl completes first.
            await Task.WhenAll(links.Select(async link =>
            {
                try
                {
                    var node = this.SiteNode.FindTreeNode(x => x.Data.Href.Equals(link.Href));

                    if (node == null)
                    {
                        if (!this.SiteNode.Children.Any(x => x.Data.Href.Equals(link.Href)))
                        {
                            this.SiteNode.AddChild(link);
                        }
                        if (depthCounter2 < MAX_DEPTH_COUNT)
                        {
                            depthCounter2++;
                            link.Links = await GetChildPagesLinksAsync(link.Href);
                            Debug.WriteLine("Href: {0}\nChildren Count: {1}\nElements Index: {2}\nDepth: {3}", link.Href, SiteNode.Children.Count, SiteNode.ElementsIndex.Count, depthCounter2);
                        }
                    }
                    else
                    {
                        if (depthCounter2 < MAX_DEPTH_COUNT)
                        {
                            depthCounter2++;
                            if (!node.IsRoot && !node.Children.Any(x => x.Data.Href.Equals(link.Href)))
                            {
                                node.AddChild(link);
                            }
                            link.Links = await GetChildPagesLinksAsync(link.Href);
                            Debug.WriteLine("Href: {0}\nChildren Count: {1}\nElements Index: {2}\nDepth: {3}", link.Href, SiteNode.Children.Count, SiteNode.ElementsIndex.Count, depthCounter2);
                        }
                    }
                }
                catch (Exception)
                {
                    // Ignore failures for this link and continue with the remaining links
                }
            }));

            return(links);
        }
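depthCounter2, MAX_DEPTH_COUNT and blackListUrls appear in several examples but are declared elsewhere in Crawler.cs. Hypothetical declarations consistent with their usage (the real values are not shown); note that depthCounter2 is incremented on every recursive call and never reset, so it caps the total number of crawled pages rather than the depth of any single branch:

        // Hypothetical declarations for the crawl-limiting fields used above; the values are assumptions.
        private const int MAX_DEPTH_COUNT = 100;                            // upper bound on recursive crawl calls
        private int depthCounter2 = 0;                                      // incremented on every recursive call, never reset
        private readonly List<string> blackListUrls = new List<string>();   // urls that should never be crawled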