예제 #1
0
        /// <summary>
        ///     Looks up the page for a url, falling back to a placeholder page when none is found.
        /// </summary>
        /// <param name="url">The url to look up; it is normalized with BddJson.NormalizeUrl before the lookup.</param>
        /// <returns>
        ///     The page returned by GetPageAsync, or a new page carrying status code 400 and the
        ///     normalized url when GetPageAsync yields null.
        /// </returns>
        public async Task<Page> GetAsync(string url)
        {
            var normalizedUrl = BddJson.NormalizeUrl(url);
            var found         = await GetPageAsync(normalizedUrl);

            if (found == null)
            {
                // Nothing came back for this url: answer with a placeholder flagged 400.
                found = new Page
                {
                    StatusCode = 400,
                    Url        = BddJson.NormalizeUrl(normalizedUrl)
                };
            }

            return found;
        }
예제 #2
0
        /// <summary>
        ///     Fetches a page, records it, and recursively crawls the links found on it.
        /// </summary>
        /// <param name="inputSite">
        ///     The site being crawled; its SiteId is stamped on the recorded page and it is
        ///     handed to the link parser and to FixPath.
        /// </param>
        /// <param name="url">The url to crawl; it is normalized before use.</param>
        /// <param name="dateTime">The date stored on every page recorded during this crawl.</param>
        /// <returns>A task that completes once this page and all pages reachable from it have been processed.</returns>
        private async Task CrawlPageAsync(InputSite inputSite, string url, DateTime dateTime)
        {
            url = BddJson.NormalizeUrl(url);

            // Each url is processed at most once.
            if (PageHasBeenCrawled(url))
            {
                return;
            }

            var response = await GetWebTextAsync(url);

            // TODO remove when server side rendering will be available
            // The markup below rewrites the two loader elements so that they carry the "ng-hide" class.
            const string loadingLayerAnimated = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
            const string loadingLayerHidden   = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";
            const string loadingAnimated      = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
            const string loadingHidden        = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";

            var page = new Page
            {
                Text = response.Content
                               .Replace("ng-cloak", "")
                               .Replace("ng-app=\"mw\"", "")
                               .Replace("ng-enter", "")
                               .Replace(loadingLayerAnimated, loadingLayerHidden)
                               .Replace(loadingAnimated, loadingHidden),
                StatusCode = response.StatusCode,
                Url        = BddJson.NormalizeUrl(url),
                SiteId     = inputSite.SiteId,
                Date       = dateTime
            };

            // The page is recorded whatever its status code is.
            _pages.Add(page);

            // Only pages answered with 200 are parsed for links and CSS classes.
            if (response.StatusCode != 200)
            {
                return;
            }

            var links = new LinkParser();
            links.ParseLinks(inputSite, page, url);

            var cssClasses = new CSSClassParser();
            cssClasses.ParseForCssClasses(page);

            // Merge what the parsers found into the crawler-wide lists.
            AddRangeButNoDuplicates(_externalUrls, links.ExternalUrls);
            AddRangeButNoDuplicates(_otherUrls, links.OtherUrls);
            AddRangeButNoDuplicates(_failedUrls, links.BadUrls);
            AddRangeButNoDuplicates(_classes, cssClasses.Classes);

            foreach (var parseError in links.Exceptions)
            {
                _exceptions.Add(parseError);
            }

            // Follow every good link found on the page.
            foreach (var link in links.GoodUrls)
            {
                // Holds the raw link until FixPath succeeds, so the failure entry names
                // whichever form of the link was current when the exception was thrown.
                var target = link;
                try
                {
                    target = FixPath(inputSite, target);

                    if (target == string.Empty)
                    {
                        continue;
                    }

                    await CrawlPageAsync(inputSite, target, dateTime);
                }
                catch (Exception exc)
                {
                    _failedUrls.Add(target + " (on page at url " + url + ") - " + exc.Message);
                }
            }
        }
예제 #3
0
        /// <summary>
        ///     Parses a page looking for links and sorts them into GoodUrls, ExternalUrls,
        ///     OtherUrls and BadUrls; parsing problems are recorded in Exceptions.
        /// </summary>
        /// <param name="inputSite">The site being crawled; passed to IsExternalUrl and Crawler.FixPath.</param>
        /// <param name="page">The page whose text is to be parsed.</param>
        /// <param name="sourceUrl">
        ///     The source url of the page. When it ends with ".xml" the text is scanned with the
        ///     sitemap pattern, otherwise with the link pattern.
        /// </param>
        public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
        {
            if (sourceUrl.EndsWith(".xml", StringComparison.Ordinal))
            {
                foreach (Match locMatch in Regex.Matches(page.Text, _SITEMAP_REGEX))
                {
                    var foundHref = BddJson.NormalizeUrl(locMatch.Value);
                    // TODO: use a regex capture group instead of stripping the tags by hand
                    // NOTE(review): the value is normalized while still wrapped in <loc> tags;
                    // confirm NormalizeUrl behaves the same as it would on the bare url.
                    foundHref = foundHref.Replace("<loc>", "");
                    foundHref = foundHref.Replace("</loc>", "");

                    if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
                    {
                        GoodUrls.Add(foundHref);
                    }
                }

                return;
            }

            foreach (Match anchorMatch in Regex.Matches(page.Text, _LINK_REGEX))
            {
                if (anchorMatch.Value == string.Empty)
                {
                    BadUrls.Add("Blank url value on page " + sourceUrl);
                    continue;
                }

                string foundHref;
                try
                {
                    // Drop the leading href=" and keep everything up to the closing quote.
                    foundHref = anchorMatch.Value.Replace("href=\"", "");
                    foundHref = foundHref.Substring(0, foundHref.IndexOf('"'));
                }
                catch (Exception exc)
                {
                    Exceptions.Add("Error parsing matched href: " + exc.Message);

                    // The href could not be extracted; skip it rather than classify a
                    // half-parsed value.
                    continue;
                }

                foundHref = BddJson.NormalizeUrl(foundHref);

                if (IsBad(foundHref) || GoodUrls.Contains(foundHref))
                {
                    continue;
                }

                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    // Record the link that was found, not the url of the page it was found on.
                    OtherUrls.Add(Crawler.FixPath(inputSite, foundHref));
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }