/// <summary>
/// Extracts the links of <paramref name="crawledPage"/> via the hyperlink parser and
/// adds one page per link to the scheduler.
/// </summary>
/// <param name="crawledPage">
/// The page whose links are scheduled. Its Uri becomes the parent Uri of every
/// scheduled page, and its CrawlDepth plus one becomes their crawl depth.
/// </param>
/// <remarks>
/// A link that raises <see cref="UriFormatException"/> while being prepared is skipped.
/// Any other exception propagates to the caller instead of being silently discarded.
/// </remarks>
protected virtual void SchedulePageLinks(CrawledPage crawledPage)
{
    IEnumerable<Uri> crawledPageLinks = _hyperLinkParser.GetLinks(crawledPage);

    foreach (Uri uri in crawledPageLinks)
    {
        // Added due to a bug in the Uri class related to this
        // (http://stackoverflow.com/questions/2814951/system-uriformatexception-invalid-uri-the-hostname-could-not-be-parsed)
        try
        {
            // NOTE(review): a CrawledPage is instantiated here although the local is typed
            // PageToCrawl - confirm whether a plain PageToCrawl was intended.
            PageToCrawl page = new CrawledPage(uri);
            page.ParentUri = crawledPage.Uri;
            page.CrawlDepth = crawledPage.CrawlDepth + 1;
            page.IsInternal = _isInternalDecisionMaker(uri, _crawlContext.RootUri);
            page.IsRoot = false;

            _scheduler.Add(page);
        }
        catch (UriFormatException)
        {
            // Deliberately skip this link: only the malformed-hostname failure described
            // above is tolerated, so that one bad link does not stop the remaining links
            // from being scheduled.
        }
    }
}
/// <summary>
/// Runs the hyperlink parser over <paramref name="crawledPage"/> and stores the
/// result in the page's ParsedLinks property.
/// </summary>
/// <param name="crawledPage">The page to parse; its ParsedLinks property is overwritten.</param>
protected virtual void ParsePageLinks(CrawledPage crawledPage)
{
    var parsedLinks = _hyperLinkParser.GetLinks(crawledPage);
    crawledPage.ParsedLinks = parsedLinks;
}