Example 1
        /// <summary>
        ///     Is the url to an external site?
        /// </summary>
        /// <param name="inputSite">The site being crawled; its BaseUrl marks links as internal.</param>
        /// <param name="url">The url whose destination is in question.</param>
        /// <returns>True if the url points to an external destination; otherwise false.</returns>
        private static bool IsExternalUrl(InputSite inputSite, string url)
        {
            // Links containing the site's own base url are internal.
            if (url.Contains(inputSite.BaseUrl))
            {
                return false;
            }

            // Absolute urls to any other host are external.
            if (url.Contains("http://") || url.Contains("https://") || url.Contains("www"))
            {
                return true;
            }

            // Everything else (relative paths) is treated as internal.
            return false;
        }
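
A minimal usage sketch for the check above. The InputSite type is not shown in these examples, so the object-initializer form here is an assumption:

        // Hypothetical sketch; InputSite is assumed to expose a settable BaseUrl string.
        var site = new InputSite { BaseUrl = "http://example.com" };

        IsExternalUrl(site, "http://example.com/about"); // false: contains the base url
        IsExternalUrl(site, "https://other.org/page");   // true: absolute url to another host
        IsExternalUrl(site, "/products/list");           // false: relative paths count as internal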
Example 2
        /// <summary>
        ///     Fixes a path, making sure it is a fully qualified absolute url.
        /// </summary>
        /// <param name="inputSite">The site being crawled; its BaseUrl is prepended to relative links.</param>
        /// <param name="originatingUrl">The link to be fixed up.</param>
        /// <returns>A fixed url that is fit to be fetched.</returns>
        public static string FixPath(InputSite inputSite, string originatingUrl)
        {
            // Strip any fragment: the part after '#' does not change the fetched resource.
            if (originatingUrl.Contains("#"))
            {
                originatingUrl = originatingUrl.Split('#')[0];
            }

            // Relative links are resolved against the site's base url.
            if (!originatingUrl.Contains("http://") && !originatingUrl.Contains("https://"))
            {
                return UrlHelper.Concat(inputSite.BaseUrl, originatingUrl);
            }

            return originatingUrl;
        }
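
A usage sketch under the same assumptions; UrlHelper.Concat is assumed to join a base url and a relative path, since its implementation is not shown here:

        // Hypothetical sketch; UrlHelper.Concat is assumed to join base url and path.
        var site = new InputSite { BaseUrl = "http://example.com" };

        FixPath(site, "/contact#team");            // "http://example.com/contact" (fragment stripped)
        FixPath(site, "https://example.com/news"); // unchanged: already an absolute url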
Example 3
        /// <summary>
        ///     Crawls a site from a set of seed urls, then persists the collected pages.
        /// </summary>
        /// <param name="inputSite">The site to crawl.</param>
        /// <param name="urls">The seed urls to start crawling from.</param>
        public async Task CrawlSiteAsync(InputSite inputSite, string[] urls)
        {
            _logger.LogInformation("Beginning crawl:" + inputSite.BaseUrl);
            var dateTime = DateTime.Now;

            foreach (var url in urls)
            {
                _logger.LogInformation("before crawl:" + url);
                await CrawlPageAsync(inputSite, url, dateTime);

                _logger.LogInformation("after crawl:" + url);
            }

            _logger.LogInformation("before clear:" + inputSite.SiteId);
            await _bdd.ClearAsync(inputSite.SiteId, dateTime);

            // Save all crawled pages in a single batch.
            await _bdd.SaveAsync(_pages);

            _logger.LogInformation("Finished crawl:" + inputSite.BaseUrl);
        }
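
A sketch of how a crawl might be started. The Crawler constructor and its logger/storage dependencies are assumptions; they are not part of these examples:

        // Hypothetical sketch; the constructor and dependencies shown are assumptions.
        var site    = new InputSite { SiteId = 42, BaseUrl = "http://example.com" };
        var crawler = new Crawler(logger, bdd);

        await crawler.CrawlSiteAsync(site, new[]
        {
            "http://example.com/",
            "http://example.com/sitemap.xml"
        });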
Example 4
        /// <summary>
        ///     Crawls a page and recursively crawls the internal links found on it.
        /// </summary>
        /// <param name="inputSite">The site being crawled.</param>
        /// <param name="url">The url to crawl.</param>
        /// <param name="dateTime">The timestamp of the current crawl run.</param>
        private async Task CrawlPageAsync(InputSite inputSite, string url, DateTime dateTime)
        {
            url = BddJson.NormalizeUrl(url);
            if (!PageHasBeenCrawled(url))
            {
                var response = await GetWebTextAsync(url);

                // TODO: remove once server-side rendering is available
                var loader1        = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
                var loader1Replace = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";
                var loader2        = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
                var loader2Replace = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";

                var page = new Page();
                page.Text       = response.Content.Replace("ng-cloak", "").Replace("ng-app=\"mw\"", "").Replace("ng-enter", "").Replace(loader1, loader1Replace).Replace(loader2, loader2Replace);
                page.StatusCode = response.StatusCode;
                page.Url        = BddJson.NormalizeUrl(url);
                page.SiteId     = inputSite.SiteId;
                page.Date       = dateTime;

                _pages.Add(page);

                if (response.StatusCode == 200)
                {
                    var linkParser = new LinkParser();
                    linkParser.ParseLinks(inputSite, page, url);

                    var classParser = new CSSClassParser();
                    classParser.ParseForCssClasses(page);

                    // Add parsed data to the main data lists.
                    AddRangeButNoDuplicates(_externalUrls, linkParser.ExternalUrls);
                    AddRangeButNoDuplicates(_otherUrls, linkParser.OtherUrls);
                    AddRangeButNoDuplicates(_failedUrls, linkParser.BadUrls);
                    AddRangeButNoDuplicates(_classes, classParser.Classes);

                    foreach (var exception in linkParser.Exceptions)
                    {
                        _exceptions.Add(exception);
                    }

                    // Crawl all the links found on the page.
                    foreach (var link in linkParser.GoodUrls)
                    {
                        var formattedLink = link;
                        try
                        {
                            formattedLink = FixPath(inputSite, formattedLink);

                            if (formattedLink != string.Empty)
                            {
                                await CrawlPageAsync(inputSite, formattedLink, dateTime);
                            }
                        }
                        catch (Exception exc)
                        {
                            _failedUrls.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
                        }
                    }
                }
            }
        }
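
PageHasBeenCrawled is what terminates the recursion above, but its body is not shown. A plausible sketch, assuming _pages is the same List<Page> populated by CrawlPageAsync (requires using System.Linq):

        // Hypothetical sketch; the real visited-check is not part of these examples.
        private bool PageHasBeenCrawled(string url)
        {
            var normalized = BddJson.NormalizeUrl(url);
            return _pages.Any(page => page.Url == normalized);
        }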
Example 5
        /// <summary>
        ///     Parses a page looking for links.
        /// </summary>
        /// <param name="inputSite">The site being crawled, used to classify links as internal or external.</param>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">The source url of the page.</param>
        public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
        {
            if (sourceUrl.EndsWith(".xml"))
            {
                // Sitemap: every <loc>...</loc> entry is a candidate url.
                var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);

                foreach (Match anchorMatch in matches)
                {
                    var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);
                    // TODO: extract the inner value with a regex group instead of string replaces
                    foundHref = foundHref.Replace("<loc>", "");
                    foundHref = foundHref.Replace("</loc>", "");

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        GoodUrls.Add(foundHref);
                    }
                }
            }
            else
            {
                var matches = Regex.Matches(page.Text, _LINK_REGEX);

                foreach (Match anchorMatch in matches)
                {

                    if (anchorMatch.Value == string.Empty)
                    {
                        BadUrls.Add("Blank url value on page " + sourceUrl);
                        continue;
                    }

                    string foundHref;
                    try
                    {
                        foundHref = anchorMatch.Value.Replace("href=\"", "");
                        foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
                    }
                    catch (Exception exc)
                    {
                        Exceptions.Add("Error parsing matched href: " + exc.Message);
                        continue; // skip this match rather than passing a null href downstream
                    }

                    foundHref = BddJson.NormalizeUrl(foundHref);

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        if (IsExternalUrl(inputSite, foundHref))
                        {
                            ExternalUrls.Add(foundHref);
                        }
                        else if (!IsAWebPage(foundHref))
                        {
                            // Resolve the non-page resource itself, not the page it was found on.
                            foundHref = Crawler.FixPath(inputSite, foundHref);
                            OtherUrls.Add(foundHref);
                        }
                        else
                        {
                            GoodUrls.Add(foundHref);
                        }
                    }
                }
            }
        }
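
The _SITEMAP_REGEX and _LINK_REGEX constants are referenced but never shown. Patterns along these lines would be consistent with the Replace/Substring handling above; they are assumptions, not the original values:

        // Hypothetical patterns consistent with the string handling in ParseLinks.
        private const string _SITEMAP_REGEX = "<loc>[^<]*</loc>"; // whole <loc>...</loc> entries
        private const string _LINK_REGEX    = "href=\"[^\"]*\"";  // whole href="..." attributes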