/// <summary>
/// Gets a previously crawled page by url, or a 400 placeholder if none exists.
/// </summary>
/// <param name="url">The url of the page to retrieve.</param>
public async Task<Page> GetAsync(string url)
{
    url = BddJson.NormalizeUrl(url);
    var page = await GetPageAsync(url);
    if (page != null)
    {
        return page;
    }

    // No stored page found: return a placeholder carrying an error status code.
    page = new Page();
    page.StatusCode = 400;
    page.Url = BddJson.NormalizeUrl(url);
    return page;
}
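// A minimal usage sketch (not part of the original source) showing how a caller might
// consume GetAsync and detect the 400 placeholder returned above. The method name and
// console output are illustrative assumptions; it is assumed to live in the same class.
public async Task PrintPageStatusAsync(string url)
{
    var page = await GetAsync(url);
    if (page.StatusCode == 400)
    {
        Console.WriteLine("No crawled page stored for " + page.Url);
    }
    else
    {
        Console.WriteLine("Crawled page " + page.Url + " returned status " + page.StatusCode);
    }
}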
/// <summary>
/// Crawls a page and recursively crawls every internal link found on it.
/// </summary>
/// <param name="inputSite">The site being crawled.</param>
/// <param name="url">The url to crawl.</param>
/// <param name="dateTime">The crawl timestamp recorded on each page.</param>
private async Task CrawlPageAsync(InputSite inputSite, string url, DateTime dateTime)
{
    url = BddJson.NormalizeUrl(url);
    if (PageHasBeenCrawled(url))
    {
        return;
    }

    var response = await GetWebTextAsync(url);

    // TODO: remove once server-side rendering is available.
    // Strips Angular bootstrapping attributes and forces the loading layers to stay hidden.
    var loader1 = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
    var loader1Replace = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";
    var loader2 = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
    var loader2Replace = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";

    var page = new Page();
    page.Text = response.Content
        .Replace("ng-cloak", "")
        .Replace("ng-app=\"mw\"", "")
        .Replace("ng-enter", "")
        .Replace(loader1, loader1Replace)
        .Replace(loader2, loader2Replace);
    page.StatusCode = response.StatusCode;
    page.Url = BddJson.NormalizeUrl(url);
    page.SiteId = inputSite.SiteId;
    page.Date = dateTime;
    _pages.Add(page);

    if (response.StatusCode != 200)
    {
        return;
    }

    var linkParser = new LinkParser();
    linkParser.ParseLinks(inputSite, page, url);

    var classParser = new CSSClassParser();
    classParser.ParseForCssClasses(page);

    // Add data to the main data lists.
    AddRangeButNoDuplicates(_externalUrls, linkParser.ExternalUrls);
    AddRangeButNoDuplicates(_otherUrls, linkParser.OtherUrls);
    AddRangeButNoDuplicates(_failedUrls, linkParser.BadUrls);
    AddRangeButNoDuplicates(_classes, classParser.Classes);

    foreach (var exception in linkParser.Exceptions)
    {
        _exceptions.Add(exception);
    }

    // Crawl all the links found on the page.
    foreach (var link in linkParser.GoodUrls)
    {
        var formattedLink = link;
        try
        {
            formattedLink = FixPath(inputSite, formattedLink);
            if (formattedLink != string.Empty)
            {
                await CrawlPageAsync(inputSite, formattedLink, dateTime);
            }
        }
        catch (Exception exc)
        {
            _failedUrls.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
        }
    }
}
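// Hypothetical sketches (not shown in the original source) of two helpers used by
// CrawlPageAsync. PageHasBeenCrawled and AddRangeButNoDuplicates are only referenced
// above; the bodies below are assumptions about their behavior, kept deliberately
// simple, and assume System.Linq and System.Collections.Generic are imported.
private bool PageHasBeenCrawled(string url)
{
    // Assumes _pages holds every page already visited during the current crawl.
    return _pages.Any(p => p.Url == url);
}

private static void AddRangeButNoDuplicates(List<string> target, IEnumerable<string> source)
{
    // Appends each source item only if it is not already present in the target list.
    foreach (var item in source)
    {
        if (!target.Contains(item))
        {
            target.Add(item);
        }
    }
}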
/// <summary>
/// Parses a page looking for links.
/// </summary>
/// <param name="inputSite">The site being crawled.</param>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
{
    if (sourceUrl.EndsWith(".xml"))
    {
        // Sitemap: every <loc> entry is a candidate url.
        var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];
            var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);

            // TODO: extract the value with a regex match instead of string replacement.
            foundHref = foundHref.Replace("<loc>", "");
            foundHref = foundHref.Replace("</loc>", "");

            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
    else
    {
        // Regular page: extract the href attribute of every matched anchor.
        var matches = Regex.Matches(page.Text, _LINK_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];
            if (anchorMatch.Value == string.Empty)
            {
                BadUrls.Add("Blank url value on page " + sourceUrl);
                continue;
            }

            string foundHref = null;
            try
            {
                foundHref = anchorMatch.Value.Replace("href=\"", "");
                foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
            }
            catch (Exception exc)
            {
                Exceptions.Add("Error parsing matched href: " + exc.Message);

                // Skip hrefs that could not be extracted.
                continue;
            }

            foundHref = BddJson.NormalizeUrl(foundHref);
            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    foundHref = Crawler.FixPath(inputSite, sourceUrl);
                    OtherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }
    }
}
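// The regex constants used by ParseLinks are not shown in the original source. The
// patterns below are plausible placeholders consistent with how the matches are
// post-processed above (href="..." fragments and <loc>...</loc> sitemap entries);
// treat them as assumptions, not the original definitions.
private const string _LINK_REGEX = "href=\"[^\"]*\"";
private const string _SITEMAP_REGEX = "<loc>[^<]*</loc>";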