/// <summary>
/// Downloads the page at <c>pageLevel.Url</c>, runs every analyzer over it and stores the
/// result through the page provider, unless the provider already returns a page for it.
/// </summary>
/// <param name="pageLevel">Url and level of the page to download; level 0 marks the page as default.</param>
/// <param name="siteDownloadContext">Download state of the site; the page level is appended to its <c>DownloadedPages</c> once the page is stored.</param>
public void Download(PageLevel pageLevel, SiteDownloadContext siteDownloadContext)
{
    // The page name is the url's absolute path with every '/' replaced by '-'.
    var urlPath = new Uri(pageLevel.Url).AbsolutePath;
    var name = urlPath.Replace("/", "-");

    var page = new Page(siteDownloadContext.Site, name)
    {
        Routes = new[] { new PageRoute() { Identifier = urlPath } },
        IsDefault = pageLevel.Level == 0
    };

    // Skip pages the provider already knows about.
    if (_pageProvider.Get(page) != null)
    {
        return;
    }

    var html = _httpClient.DownloadString(pageLevel.Url);
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    var downloadContext = new PageDownloadContext(siteDownloadContext, pageLevel, html);
    foreach (var pageAnalyzer in _analyzers)
    {
        pageAnalyzer.Analyze(downloadContext);
    }

    // Persist the document as it stands after the analyzers ran, not the raw download.
    page.Html = downloadContext.HtmlDocument.DocumentNode.InnerHtml;
    _pageProvider.Add(page);
    siteDownloadContext.DownloadedPages.Add(pageLevel);
}
/// <summary>
/// Creates the context for a single downloaded page and parses its markup into
/// <c>HtmlDocument</c>.
/// </summary>
/// <param name="siteDownloadContext">Context of the site download this page belongs to.</param>
/// <param name="pageLevel">Page level of the downloaded page.</param>
/// <param name="pageHtml">Markup of the page; kept in <c>PageHtml</c> and loaded into the document.</param>
public PageDownloadContext(SiteDownloadContext siteDownloadContext, PageLevel pageLevel, string pageHtml)
{
    SiteDownloadContext = siteDownloadContext;
    PageLevel = pageLevel;
    PageHtml = pageHtml;

    // Parse into a local first, then publish the loaded document on the property.
    var document = new HtmlDocument();
    document.LoadHtml(pageHtml);
    HtmlDocument = document;
}
/// <summary>
/// Rewrites the anchors of the downloaded page that resolve to an inside url and queues the
/// pages they point to for download.
/// </summary>
/// <remarks>
/// Does nothing when the page's level is not below <c>Options.Deep</c>. Only anchors that
/// have an <c>href</c> attribute and non-blank inner text are considered, and only when
/// <c>UriHelper.GetInsideAbsoluteUrl</c> returns a non-empty url for them. While fewer than
/// <c>Options.Pages</c> pages have been queued by this call, the anchor is pointed at the
/// front preview url of the site and its target is queued unless it equals the current page
/// or is already downloaded or queued. After that limit is reached, the anchor is pointed at
/// the resolved inside url instead.
/// </remarks>
/// <param name="context">Context of the page being analyzed; its document is modified in place.</param>
public void Analyze(PageDownloadContext context)
{
    var siteContext = context.SiteDownloadContext;
    if (context.PageLevel.Level >= siteContext.Options.Deep)
    {
        return;
    }

    // One comparer serves every equality and membership check below.
    var comparer = new PageLevelComparer();

    var anchors = context.HtmlDocument.DocumentNode.Descendants()
        .Where(node => node.Name == "a"
            && node.Attributes["href"] != null
            && node.InnerText.Trim().Length > 0);

    int queuedCount = 0;
    foreach (var anchor in anchors)
    {
        var href = anchor.Attributes["href"];
        var insideUrl = UriHelper.GetInsideAbsoluteUrl(context.PageLevel.Url, href.Value);
        if (string.IsNullOrEmpty(insideUrl))
        {
            continue;
        }

        if (queuedCount >= siteContext.Options.Pages)
        {
            // Page limit reached: point the anchor at the resolved inside url.
            href.Value = insideUrl;
            continue;
        }

        var absolutePath = new Uri(insideUrl).AbsolutePath;
        href.Value = "/" + SiteExtensions.PREFIX_FRONT_PREVIEW_URL + siteContext.Site.AbsoluteName + absolutePath;

        var nextPageLevel = new PageLevel(insideUrl, context.PageLevel.Level + 1);
        bool alreadyKnown = comparer.Equals(context.PageLevel, nextPageLevel)
            || siteContext.DownloadedPages.Contains(nextPageLevel, comparer)
            || siteContext.DownloadQueue.Contains(nextPageLevel, comparer);

        // Only newly queued pages count toward the limit; rewritten links to known pages do not.
        if (!alreadyKnown)
        {
            siteContext.DownloadQueue.Enqueue(nextPageLevel);
            queuedCount++;
        }
    }
}
/// <summary>
/// Creates the event arguments for a downloaded page.
/// </summary>
/// <param name="downloadPage">Page level exposed through <c>DownloadedPage</c>.</param>
public PageDownloadedEventArgs(PageLevel downloadPage)
{
    DownloadedPage = downloadPage;
}