Exemplo n.º 1
0
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
        /// bypassed, or ignored.
        /// </summary>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory</param>
        /// <param name="targetUri">The target Uri being processed</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            CrawledLink bypassedLink = null;

            if (targetUri.Scheme == Uri.UriSchemeMailto)
            {
                // Mailto schema: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0)
            {
                // Exact self loops: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
            {
                // Duplicates: bypass
                bypassedLink            = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else
            {
                // process link to be crawled that was parsed from a crawled page, so
                // it will not be a root.
                var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
                MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

                if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
                {
                    ExternalLinksFound |= true;
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
        /// bypassed, or ignored.
        /// </summary>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory</param>
        /// <param name="targetUri">The target Uri being processed</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            CrawledLink bypassedLink = null;

            if (targetUri.Scheme == Uri.UriSchemeMailto)
            {
                // Mailto schema: bypass
                bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (string.Compare(page.Uri.AbsoluteUri, targetUri.AbsoluteUri) == 0)
            {
                // Exact self loops: bypass
                bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed = true;
                LinksToByPass.Add(bypassedLink);
            }
            else if (MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri))
            {
                // Duplicates: bypass
                bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed = true;
                LinksToByPass.Add(bypassedLink);
            }
            else
            {
                // process link to be crawled that was parsed from a crawled page, so
                // it will not be a root.
                var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
                MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

                if (string.Compare(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), true) != 0)
                    ExternalLinksFound |= true;
            }
        }