/// <summary>
/// Is the url to an external site?
/// </summary>
/// <param name="inputSite">The site being crawled, providing the base url to compare against.</param>
/// <param name="url">The url whose externality of destination is in question.</param>
/// <returns>Boolean indicating whether or not the url is to an external destination.</returns>
private static bool IsExternalUrl(InputSite inputSite, string url)
{
    // Anything containing the site's own base url is considered internal.
    if (url.IndexOf(inputSite.BaseUrl) > -1)
    {
        return false;
    }

    // Absolute urls pointing anywhere else are external.
    if (url.Contains("http://") || url.Contains("www") || url.Contains("https://"))
    {
        return true;
    }

    return false;
}
/// <summary>
/// Fixes a path. Makes sure it is a fully functional absolute url.
/// </summary>
/// <param name="inputSite">The site being crawled, whose base url is prepended to relative links.</param>
/// <param name="originatingUrl">The link to be fixed up.</param>
/// <returns>A fixed url that is fit to be fetched.</returns>
public static string FixPath(InputSite inputSite, string originatingUrl)
{
    // Drop any fragment identifier; it is irrelevant when fetching the page.
    if (originatingUrl.Contains("#"))
    {
        originatingUrl = originatingUrl.Split('#')[0];
    }

    // Relative links are resolved against the site's base url.
    if (!originatingUrl.Contains("http://") && !originatingUrl.Contains("https://"))
    {
        return UrlHelper.Concat(inputSite.BaseUrl, originatingUrl);
    }

    return originatingUrl;
}
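// UrlHelper.Concat is used by FixPath but is not shown in this listing. The sketch below
// is one plausible implementation, assuming it simply joins the base url and a relative
// path with a single separating slash; the real helper may differ.
public static class UrlHelper
{
    /// <summary>
    /// Joins a base url and a relative path, keeping exactly one separating slash.
    /// </summary>
    public static string Concat(string baseUrl, string relativeUrl)
    {
        return baseUrl.TrimEnd('/') + "/" + relativeUrl.TrimStart('/');
    }
}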
/// <summary>
/// Crawls a site.
/// </summary>
public async Task CrawlSiteAsync(InputSite inputSite, string[] urls)
{
    _logger.LogInformation("Beginning crawl:" + inputSite.BaseUrl);
    var dateTime = DateTime.Now;

    foreach (var url in urls)
    {
        _logger.LogInformation("before crawl:" + url);
        await CrawlPageAsync(inputSite, url, dateTime);
        _logger.LogInformation("after crawl:" + url);
    }

    _logger.LogInformation("before clear:" + inputSite.SiteId);
    await _bdd.ClearAsync(inputSite.SiteId, dateTime);

    // Pages are saved in a single batch; the per-page save loop is left commented out.
    //foreach (var page in _pages)
    //{
    //    _logger.LogInformation("before save:" + page.Url);
    //}
    await _bdd.SaveAsync(_pages);

    _logger.LogInformation("Finished crawl:" + inputSite.BaseUrl);
}
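// A hypothetical usage sketch for the crawl entry point above. The InputSite property
// initializers and the seed urls are assumptions made for illustration; only
// CrawlSiteAsync itself appears in this listing.
public async Task RunExampleCrawlAsync()
{
    var site = new InputSite { SiteId = 1, BaseUrl = "https://example.com/" };

    // Seed the crawl with the home page and the sitemap; CrawlSiteAsync follows links from there.
    await CrawlSiteAsync(site, new[] { site.BaseUrl, site.BaseUrl + "sitemap.xml" });
}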
/// <summary>
/// Crawls a page.
/// </summary>
/// <param name="inputSite">The site being crawled.</param>
/// <param name="url">The url to crawl.</param>
/// <param name="dateTime">The timestamp of the current crawl run.</param>
private async Task CrawlPageAsync(InputSite inputSite, string url, DateTime dateTime)
{
    url = BddJson.NormalizeUrl(url);
    if (!PageHasBeenCrawled(url))
    {
        var response = await GetWebTextAsync(url);

        // TODO remove when server side rendering becomes available
        var loader1 = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
        var loader1Replace = "id=\"loading-layer\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";
        var loader2 = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-animate ng-hide-animate ng-hide-add\"";
        var loader2Replace = "id=\"loading\" ng-show=\"loader.isLoading\" class=\"ng-hide\"";

        // Strip the Angular bootstrap attributes and hide the loading overlays in the stored markup.
        var page = new Page();
        page.Text = response.Content
            .Replace("ng-cloak", "")
            .Replace("ng-app=\"mw\"", "")
            .Replace("ng-enter", "")
            .Replace(loader1, loader1Replace)
            .Replace(loader2, loader2Replace);
        page.StatusCode = response.StatusCode;
        page.Url = BddJson.NormalizeUrl(url);
        page.SiteId = inputSite.SiteId;
        page.Date = dateTime;
        _pages.Add(page);

        if (response.StatusCode == 200)
        {
            var linkParser = new LinkParser();
            linkParser.ParseLinks(inputSite, page, url);

            var classParser = new CSSClassParser();
            classParser.ParseForCssClasses(page);

            // Add data to main data lists
            AddRangeButNoDuplicates(_externalUrls, linkParser.ExternalUrls);
            AddRangeButNoDuplicates(_otherUrls, linkParser.OtherUrls);
            AddRangeButNoDuplicates(_failedUrls, linkParser.BadUrls);
            AddRangeButNoDuplicates(_classes, classParser.Classes);

            foreach (var exception in linkParser.Exceptions)
            {
                _exceptions.Add(exception);
            }

            // Crawl all the links found on the page.
            foreach (var link in linkParser.GoodUrls)
            {
                var formattedLink = link;
                try
                {
                    formattedLink = FixPath(inputSite, formattedLink);
                    if (formattedLink != string.Empty)
                    {
                        await CrawlPageAsync(inputSite, formattedLink, dateTime);
                    }
                }
                catch (Exception exc)
                {
                    _failedUrls.Add(formattedLink + " (on page at url " + url + ") - " + exc.Message);
                }
            }
        }
    }
}
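// PageHasBeenCrawled and AddRangeButNoDuplicates are referenced by CrawlPageAsync but
// are not part of this listing. The sketches below show one plausible shape for them,
// assuming _pages is the List<Page> collected during the crawl and the url lists are
// List<string>; they are not the project's actual implementation.

/// <summary>
/// Checks whether a page was already visited in this run, so the recursion does not loop.
/// </summary>
private bool PageHasBeenCrawled(string url)
{
    var normalized = BddJson.NormalizeUrl(url);
    return _pages.Exists(p => p.Url == normalized);
}

/// <summary>
/// Copies items from one list to another, skipping values that are already present.
/// </summary>
private static void AddRangeButNoDuplicates(List<string> target, List<string> source)
{
    foreach (var item in source)
    {
        if (!target.Contains(item))
        {
            target.Add(item);
        }
    }
}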
/// <summary>
/// Parses a page looking for links.
/// </summary>
/// <param name="inputSite">The site being crawled.</param>
/// <param name="page">The page whose text is to be parsed.</param>
/// <param name="sourceUrl">The source url of the page.</param>
public void ParseLinks(InputSite inputSite, Page page, string sourceUrl)
{
    if (sourceUrl.EndsWith(".xml"))
    {
        // Sitemap: every <loc> entry is a candidate url.
        var matches = Regex.Matches(page.Text, _SITEMAP_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];

            var foundHref = BddJson.NormalizeUrl(anchorMatch.Value);
            // TODO use a Regex match instead of string replaces
            foundHref = foundHref.Replace("<loc>", "");
            foundHref = foundHref.Replace("</loc>", "");

            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                GoodUrls.Add(foundHref);
            }
        }
    }
    else
    {
        // Regular page: collect the href attributes of the anchor tags.
        var matches = Regex.Matches(page.Text, _LINK_REGEX);
        for (var i = 0; i <= matches.Count - 1; i++)
        {
            var anchorMatch = matches[i];

            if (anchorMatch.Value == string.Empty)
            {
                BadUrls.Add("Blank url value on page " + sourceUrl);
                continue;
            }

            string foundHref = null;
            try
            {
                foundHref = anchorMatch.Value.Replace("href=\"", "");
                foundHref = foundHref.Substring(0, foundHref.IndexOf("\""));
            }
            catch (Exception exc)
            {
                Exceptions.Add("Error parsing matched href: " + exc.Message);
                // Skip hrefs that could not be extracted instead of normalizing a null value.
                continue;
            }

            foundHref = BddJson.NormalizeUrl(foundHref);

            if (!IsBad(foundHref) && !GoodUrls.Contains(foundHref))
            {
                if (IsExternalUrl(inputSite, foundHref))
                {
                    ExternalUrls.Add(foundHref);
                }
                else if (!IsAWebPage(foundHref))
                {
                    // Record the non-page resource itself, resolved to an absolute url.
                    foundHref = Crawler.FixPath(inputSite, foundHref);
                    OtherUrls.Add(foundHref);
                }
                else
                {
                    GoodUrls.Add(foundHref);
                }
            }
        }
    }
}
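// IsBad and IsAWebPage are used by ParseLinks but are not included in this listing.
// The sketches below are guesses based only on how they are called; the prefixes and
// file extensions checked here are assumptions, not the project's actual rules.

/// <summary>
/// Filters out hrefs that should never be crawled (blank values, anchors, mail and script links).
/// </summary>
private static bool IsBad(string url)
{
    return string.IsNullOrWhiteSpace(url)
        || url.StartsWith("#")
        || url.StartsWith("mailto:")
        || url.StartsWith("javascript:");
}

/// <summary>
/// Decides whether a url points to an html page rather than a downloadable file.
/// </summary>
private static bool IsAWebPage(string url)
{
    var path = url.Split('?')[0].ToLowerInvariant();
    return !path.EndsWith(".pdf") && !path.EndsWith(".jpg") &&
           !path.EndsWith(".png") && !path.EndsWith(".zip");
}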