public PageLinksCrawlDisallowedArgs(CrawlContext crawlContext, CrawledPage crawledPage, string disallowedReason)
    : base(crawlContext, crawledPage)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
public PageCrawlCompletedArgs(CrawlContext crawlContext, CrawledPage crawledPage)
    : base(crawlContext)
{
    if (crawledPage == null)
        throw new ArgumentNullException("crawledPage");

    CrawledPage = crawledPage;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    IEnumerable<string> hrefValues = crawledPage.CsQueryDocument.Select("a, area")
        .Elements
        .Select(y => y.GetAttribute("href"))
        .Where(a => !string.IsNullOrWhiteSpace(a));

    return hrefValues;
}
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    string hrefValue = "";
    HtmlNode node = crawledPage.HtmlDocument.DocumentNode.SelectSingleNode("//base");

    //If a <base> tag exists, use its href attribute as the base for resolving relative links
    if (node != null)
        hrefValue = node.GetAttributeValue("href", "").Trim();

    return hrefValue;
}
protected override IEnumerable<string> GetHrefValues(CrawledPage crawledPage)
{
    List<string> hrefValues = new List<string>();

    HtmlNodeCollection aTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//a[@href]");
    HtmlNodeCollection areaTags = crawledPage.HtmlDocument.DocumentNode.SelectNodes("//area[@href]");

    hrefValues.AddRange(GetLinks(aTags));
    hrefValues.AddRange(GetLinks(areaTags));

    return hrefValues;
}
/// <summary>
/// Makes an http web request to the url and downloads its content based on the shouldDownloadContent func decision
/// </summary>
public virtual CrawledPage MakeRequest(Uri uri, Func<CrawledPage, CrawlDecision> shouldDownloadContent)
{
    if (uri == null)
        throw new ArgumentNullException("uri");

    CrawledPage crawledPage = new CrawledPage(uri);

    HttpWebRequest request = null;
    HttpWebResponse response = null;
    try
    {
        request = BuildRequestObject(uri);
        response = (HttpWebResponse)request.GetResponse();
    }
    catch (WebException e)
    {
        crawledPage.WebException = e;

        if (e.Response != null)
            response = (HttpWebResponse)e.Response;

        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    catch (Exception e)
    {
        _logger.DebugFormat("Error occurred requesting url [{0}]", uri.AbsoluteUri);
        _logger.Debug(e);
    }
    finally
    {
        crawledPage.HttpWebRequest = request;

        if (response != null)
        {
            crawledPage.HttpWebResponse = response;
            CrawlDecision shouldDownloadContentDecision = shouldDownloadContent(crawledPage);
            if (shouldDownloadContentDecision.Allow)
            {
                crawledPage.RawContent = GetRawHtml(response, uri);
                crawledPage.PageSizeInBytes = Encoding.UTF8.GetBytes(crawledPage.RawContent).Length;
            }
            else
            {
                _logger.DebugFormat("Content of page [{0}] not downloaded, [{1}]", crawledPage.Uri.AbsoluteUri, shouldDownloadContentDecision.Reason);
            }
            response.Close();
        }
    }

    return crawledPage;
}
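//Illustrative usage sketch, not part of the original source: shows how MakeRequest above might be called with a
//shouldDownloadContent delegate. The method name and the "always allow" rule are hypothetical; only MakeRequest,
//CrawledPage, and CrawlDecision come from the code in this listing.
public virtual CrawledPage MakeRequestAndDownloadExample(Uri uri)
{
    //Content is only downloaded when the delegate returns an allowing decision for the crawled page
    Func<CrawledPage, CrawlDecision> shouldDownloadContent = page => new CrawlDecision { Allow = true };

    return MakeRequest(uri, shouldDownloadContent);
}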
/// <summary>
/// Parses html to extract hyperlinks, converts each into an absolute url
/// </summary>
public virtual IEnumerable<Uri> GetLinks(CrawledPage crawledPage)
{
    CheckParams(crawledPage);

    Stopwatch timer = Stopwatch.StartNew();
    List<Uri> uris = GetUris(crawledPage, GetHrefValues(crawledPage));
    timer.Stop();

    _logger.DebugFormat("{0} parsed links from [{1}] in [{2}] milliseconds", ParserType, crawledPage.Uri, timer.ElapsedMilliseconds);

    return uris;
}
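//Illustrative sketch, not part of the original source: enumerating the absolute uris returned by GetLinks for a
//crawled page. The _hyperLinkParser field name mirrors the one used later in this listing; the per-link debug
//logging is purely for demonstration.
protected virtual void LogParsedLinksExample(CrawledPage crawledPage)
{
    foreach (Uri uri in _hyperLinkParser.GetLinks(crawledPage))
        _logger.DebugFormat("Found link [{0}] on page [{1}]", uri.AbsoluteUri, crawledPage.Uri.AbsoluteUri);
}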
public virtual CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (string.IsNullOrWhiteSpace(crawledPage.RawContent))
        return new CrawlDecision { Allow = false, Reason = "Page has no content" };

    if (!crawlContext.CrawlConfiguration.IsExternalPageLinksCrawlingEnabled && !crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    if (crawledPage.CrawlDepth >= crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    return new CrawlDecision { Allow = true };
}
protected virtual List<Uri> GetUris(CrawledPage crawledPage, IEnumerable<string> hrefValues)
{
    List<Uri> uris = new List<Uri>();
    if (hrefValues == null || hrefValues.Count() < 1)
        return uris;

    //Use the uri of the page that actually responded to the request instead of crawledPage.Uri (Issue 82).
    //Using HttpWebRequest.Address instead of HttpWebResponse.ResponseUri since this is the best practice and mentioned on http://msdn.microsoft.com/en-us/library/system.net.httpwebresponse.responseuri.aspx
    Uri uriToUse = crawledPage.HttpWebRequest.Address ?? crawledPage.Uri;

    //If an html base tag exists use it instead of the page uri for relative links
    string baseHref = GetBaseHrefValue(crawledPage);
    if (!string.IsNullOrEmpty(baseHref))
    {
        try
        {
            uriToUse = new Uri(baseHref);
        }
        catch { }
    }

    string href = "";
    foreach (string hrefValue in hrefValues)
    {
        try
        {
            href = hrefValue.Split('#')[0];
            Uri newUri = new Uri(uriToUse, href);

            if (!uris.Contains(newUri))
                uris.Add(newUri);
        }
        catch (Exception e)
        {
            _logger.DebugFormat("Could not parse link [{0}] on page [{1}]", hrefValue, crawledPage.Uri);
            _logger.Debug(e);
        }
    }

    return uris;
}
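//Minimal sketch of the resolution rule GetUris applies, using hypothetical values that are not from the original
//source: a relative href is combined with the base href (or the responding page's address when no <base> tag
//exists) and the fragment after '#' is discarded before the Uri is built.
protected virtual void ResolveRelativeLinkExample()
{
    Uri uriToUse = new Uri("http://example.com/section/");     //from <base href=""> or the page uri
    string href = "page.html#comments".Split('#')[0];          //fragment is stripped, leaving "page.html"

    Uri newUri = new Uri(uriToUse, href);                      //http://example.com/section/page.html
    _logger.Debug(newUri.AbsoluteUri);
}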
protected virtual bool ShouldCrawlPageLinks(CrawledPage crawledPage)
{
    CrawlDecision shouldCrawlPageLinksDecision = _crawlDecisionMaker.ShouldCrawlPageLinks(crawledPage, _crawlContext);
    if (shouldCrawlPageLinksDecision.Allow)
        shouldCrawlPageLinksDecision = (_shouldCrawlPageLinksDecisionMaker != null)
            ? _shouldCrawlPageLinksDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };

    if (!shouldCrawlPageLinksDecision.Allow)
    {
        _logger.DebugFormat("Links on page [{0}] not crawled, [{1}]", crawledPage.Uri.AbsoluteUri, shouldCrawlPageLinksDecision.Reason);
        FirePageLinksCrawlDisallowedEventAsync(crawledPage, shouldCrawlPageLinksDecision.Reason);
        FirePageLinksCrawlDisallowedEvent(crawledPage, shouldCrawlPageLinksDecision.Reason);
    }

    return shouldCrawlPageLinksDecision.Allow;
}
protected virtual CrawlDecision ShouldDownloadPageContentWrapper(CrawledPage crawledPage)
{
    CrawlDecision decision = _crawlDecisionMaker.ShouldDownloadPageContent(crawledPage, _crawlContext);
    if (decision.Allow)
        decision = (_shouldDownloadPageContentDecisionMaker != null)
            ? _shouldDownloadPageContentDecisionMaker.Invoke(crawledPage, _crawlContext)
            : new CrawlDecision { Allow = true };

    return decision;
}
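//Illustrative sketch, not part of the original source: a custom decision delegate with the same shape as the
//_shouldDownloadPageContentDecisionMaker consulted by the wrapper above, letting a caller veto the default decision.
//The "text/html only" rule and the method name are hypothetical.
protected virtual Func<CrawledPage, CrawlContext, CrawlDecision> BuildHtmlOnlyContentDecisionExample()
{
    return (crawledPage, crawlContext) =>
        (crawledPage.HttpWebResponse != null && crawledPage.HttpWebResponse.ContentType.ToLower().Contains("text/html"))
            ? new CrawlDecision { Allow = true }
            : new CrawlDecision { Allow = false, Reason = "Only text/html content is downloaded" };
}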
protected virtual bool PageSizeIsAboveMax(CrawledPage crawledPage)
{
    bool isAboveMax = false;
    if (_crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 &&
        crawledPage.PageSizeInBytes > _crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
    {
        isAboveMax = true;
        _logger.DebugFormat("Page [{0}] has a page size of [{1}] bytes which is above the [{2}] byte max", crawledPage.Uri, crawledPage.PageSizeInBytes, _crawlContext.CrawlConfiguration.MaxPageSizeInBytes);
    }

    return isAboveMax;
}
protected virtual void SchedulePageLinks(CrawledPage crawledPage)
{
    IEnumerable<Uri> crawledPageLinks = _hyperLinkParser.GetLinks(crawledPage);
    foreach (Uri uri in crawledPageLinks)
    {
        //Added due to a bug in the Uri class related to this (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
        try
        {
            PageToCrawl page = new CrawledPage(uri);
            page.ParentUri = crawledPage.Uri;
            page.CrawlDepth = crawledPage.CrawlDepth + 1;
            page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
            page.IsRoot = false;

            _scheduler.Add(page);
        }
        catch { }
    }
}
protected virtual void FirePageLinksCrawlDisallowedEvent(CrawledPage crawledPage, string reason)
{
    try
    {
        EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowed;
        if (threadSafeEvent != null)
            threadSafeEvent(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the PageLinksCrawlDisallowed event for url:" + crawledPage.Uri.AbsoluteUri);
        _logger.Error(e);
    }
}
protected virtual void FirePageLinksCrawlDisallowedEventAsync(CrawledPage crawledPage, string reason)
{
    EventHandler<PageLinksCrawlDisallowedArgs> threadSafeEvent = PageLinksCrawlDisallowedAsync;
    if (threadSafeEvent != null)
    {
        //Fire each subscriber's delegate async
        foreach (EventHandler<PageLinksCrawlDisallowedArgs> del in threadSafeEvent.GetInvocationList())
        {
            del.BeginInvoke(this, new PageLinksCrawlDisallowedArgs(_crawlContext, crawledPage, reason), null, null);
        }
    }
}
protected virtual void CheckParams(CrawledPage crawledPage)
{
    if (crawledPage == null)
        throw new ArgumentNullException("crawledPage");
}
protected override string GetBaseHrefValue(CrawledPage crawledPage)
{
    string baseTagValue = crawledPage.CsQueryDocument.Select("base").Attr("href") ?? "";
    return baseTagValue.Trim();
}
public virtual CrawlDecision ShouldDownloadPageContent(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (crawledPage == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawled page" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (crawledPage.HttpWebResponse == null)
        return new CrawlDecision { Allow = false, Reason = "Null HttpWebResponse" };

    if (crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        return new CrawlDecision { Allow = false, Reason = "HttpStatusCode is not 200" };

    string pageContentType = crawledPage.HttpWebResponse.ContentType.ToLower().Trim();
    bool isDownloadable = false;
    foreach (string downloadableContentType in crawlContext.CrawlConfiguration.DownloadableContentTypes.Split(','))
    {
        if (pageContentType.Contains(downloadableContentType.ToLower().Trim()))
        {
            isDownloadable = true;
            break;
        }
    }
    if (!isDownloadable)
        return new CrawlDecision { Allow = false, Reason = "Content type is not any of the following: " + crawlContext.CrawlConfiguration.DownloadableContentTypes };

    if (crawlContext.CrawlConfiguration.MaxPageSizeInBytes > 0 && crawledPage.HttpWebResponse.ContentLength > crawlContext.CrawlConfiguration.MaxPageSizeInBytes)
        return new CrawlDecision { Allow = false, Reason = string.Format("Page size of [{0}] bytes is above the max allowable of [{1}] bytes", crawledPage.HttpWebResponse.ContentLength, crawlContext.CrawlConfiguration.MaxPageSizeInBytes) };

    return new CrawlDecision { Allow = true };
}
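//Hedged configuration sketch, not part of the original source: the content-type and size checks above read
//CrawlConfiguration.DownloadableContentTypes (a comma separated list) and MaxPageSizeInBytes (0 means no limit).
//The property names come from the code above; the values and the assumption of settable properties are illustrative.
protected virtual CrawlConfiguration BuildExampleCrawlConfiguration()
{
    return new CrawlConfiguration
    {
        DownloadableContentTypes = "text/html, text/plain",
        MaxPageSizeInBytes = 1048576    //pages reporting a Content-Length above 1 MB will not be downloaded
    };
}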
protected abstract IEnumerable<string> GetHrefValues(CrawledPage crawledPage);
protected abstract string GetBaseHrefValue(CrawledPage crawledPage);