/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be
/// crawled or bypassed.
/// </summary>
/// <remarks>
/// A link is bypassed (recorded in LinksToByPass instead of being queued) when it uses the
/// mailto scheme, when it points back at the page it was parsed from, or when its absolute
/// Uri is already a key in MapOfLinksToCrawl. Any other link is added to MapOfLinksToCrawl,
/// and ExternalLinksFound is set when its base domain differs from the page's base domain.
/// </remarks>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory used to create the link objects.</param>
/// <param name="targetUri">The target Uri being processed.</param>
/// <param name="sessionId">Session identifier forwarded to the factory when creating links.</param>
/// <param name="crawlerId">Crawler identifier forwarded to the factory when creating bypassed links.</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    // Uris are machine identifiers, so compare them ordinally. A culture-sensitive
    // comparison can report two different Uris as equal.
    // The checks short-circuit in the same order as before: mailto, self loop, duplicate.
    bool shouldBypass =
        targetUri.Scheme == Uri.UriSchemeMailto
        || string.Equals(page.Uri.AbsoluteUri, targetUri.AbsoluteUri, StringComparison.Ordinal)
        || MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri);

    if (shouldBypass)
    {
        // All three bypass reasons record the link in exactly the same way.
        CrawledLink bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
        return;
    }

    // Process link to be crawled that was parsed from a crawled page, so
    // it will not be a root.
    var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
    MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

    // Host names are case-insensitive, but the comparison must not depend on the
    // current culture (e.g. the Turkish dotted/dotless i).
    if (!string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase))
    {
        ExternalLinksFound = true;
    }
}
/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be
/// crawled or bypassed.
/// </summary>
/// <remarks>
/// Mailto links, links that point back at the page they were parsed from, and links whose
/// absolute Uri is already a key in MapOfLinksToCrawl are recorded in LinksToByPass and are
/// not queued. Every other link is added to MapOfLinksToCrawl; ExternalLinksFound is set
/// when the link's base domain differs from the page's base domain.
/// </remarks>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory used to create the link objects.</param>
/// <param name="targetUri">The target Uri being processed.</param>
/// <param name="sessionId">Session identifier forwarded to the factory when creating links.</param>
/// <param name="crawlerId">Crawler identifier forwarded to the factory when creating bypassed links.</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    // Evaluated lazily and in the original order: mailto, exact self loop, duplicate.
    // The self-loop check is ordinal because Uris are identifiers, not linguistic text.
    bool isBypassed =
        targetUri.Scheme == Uri.UriSchemeMailto
        || string.Equals(page.Uri.AbsoluteUri, targetUri.AbsoluteUri, StringComparison.Ordinal)
        || MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri);

    if (!isBypassed)
    {
        // Process link to be crawled that was parsed from a crawled page, so
        // it will not be a root.
        var linkToCrawl = factory.CreateLinkToCrawl(page, targetUri, sessionId);
        MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, linkToCrawl);

        // Case-insensitive but culture-independent: domain names must compare the
        // same way regardless of the machine's current culture.
        bool sameBaseDomain = string.Equals(
            page.Uri.GetBaseDomain(),
            targetUri.GetBaseDomain(),
            StringComparison.OrdinalIgnoreCase);

        if (!sameBaseDomain)
        {
            ExternalLinksFound = true;
        }

        return;
    }

    // Every bypass reason is recorded identically, one level below the source page.
    CrawledLink bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
    bypassedLink.IsRoot = false;
    bypassedLink.CrawlDepth = page.CrawlDepth + 1;
    bypassedLink.StatusCode = HttpStatusCode.OK;
    bypassedLink.Bypassed = true;
    LinksToByPass.Add(bypassedLink);
}