示例#1
0
        /// <summary>
        /// Builds a storable <see cref="ProcessedPage"/> from the result of crawling a single url.
        /// </summary>
        /// <param name="page">
        /// The crawled page; its PageBag supplies the session and crawler ids, and its
        /// Uri and HttpWebResponse supply the page url and status code.
        /// </param>
        /// <returns>A new ProcessedPage populated from <paramref name="page"/>.</returns>
        public ProcessedPage ProcessPage(Abot.Poco.CrawledPage page)
        {
            //TODO extract data
            var result = new ProcessedPage
            {
                SessionId = page.PageBag.SessionId,
                CrawlerId = page.PageBag.CrawlerId,
                PageUrl = page.Uri.AbsoluteUri,
                StatusCode = page.HttpWebResponse.StatusCode
            };

            //TODO store cookies
            // The cookies are read here but not yet persisted anywhere.
            var responseCookies = page.HttpWebResponse.Cookies;

            return result;
        }
示例#2
0
        /// <summary>
        /// Sorts the links parsed from a crawled page into LinksToCrawl and LinksToByPass.
        /// </summary>
        /// <remarks>
        /// Both result collections are replaced on every call. When the page has parsed links,
        /// MapOfLinksToCrawl is used as scratch storage while each link is handed to ProcessLink,
        /// and is cleared and set to null once LinksToCrawl has been built from it.
        /// </remarks>
        /// <param name="page">
        /// The crawled page whose ParsedLinks are processed; its PageBag supplies the
        /// session and crawler ids passed on to ProcessLink.
        /// </param>
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            // Any() stops at the first element instead of enumerating the whole sequence.
            if (page.ParsedLinks == null || !page.ParsedLinks.Any())
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl = new List<LinkToCrawl>();
                LinksToByPass = new List<CrawledLink>();
                return;
            }

            LinksToByPass = new List<CrawledLink>();
            MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;

                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                // Materialize the result before releasing the scratch map.
                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;

                // Guarded so the joined url strings are only built when debug logging is on.
                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }
示例#3
0
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be
        /// crawled or bypassed.
        /// </summary>
        /// <remarks>
        /// A link is bypassed (recorded in LinksToByPass) when it uses the mailto scheme, when it
        /// points at the page it was parsed from, or when the same absolute url is already present
        /// in MapOfLinksToCrawl. Any other link is added to MapOfLinksToCrawl keyed by its absolute
        /// url, and ExternalLinksFound is set when its base domain differs from the page's.
        /// </remarks>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory</param>
        /// <param name="targetUri">The target Uri being processed</param>
        /// <param name="sessionId">The session id passed to the factory when creating link objects.</param>
        /// <param name="crawlerId">The crawler id passed to the factory when creating bypassed links.</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            var targetUrl = targetUri.AbsoluteUri;

            // Evaluated in the original order with short-circuiting:
            //   1. mailto scheme
            //   2. exact self loop (ordinal comparison, consistent with the dictionary key lookup)
            //   3. duplicate of a link already queued from this page
            bool bypass =
                targetUri.Scheme == Uri.UriSchemeMailto
                || string.Equals(page.Uri.AbsoluteUri, targetUrl, StringComparison.Ordinal)
                || MapOfLinksToCrawl.ContainsKey(targetUrl);

            if (bypass)
            {
                CrawledLink bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed = true;
                LinksToByPass.Add(bypassedLink);
                return;
            }

            // process link to be crawled that was parsed from a crawled page, so
            // it will not be a root.
            var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
            MapOfLinksToCrawl.Add(targetUrl, link);

            // Domain names are compared case-insensitively and without culture rules.
            if (!string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase))
            {
                ExternalLinksFound = true;
            }
        }