Example No. 1
        /// <summary>
        /// If this method is called, it is assumed that any pre-filtering of links to avoid
        /// has already been applied and that the <paramref name="page"/> should be stored for future crawling.
        /// </summary>
        /// <param name="page">The page to convert to a link and store for future crawling; must not be null.</param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
            {
                throw new ArgumentNullException(nameof(page));
            }

            //_logger.DebugFormat("Add(page): Target: {0}, Source: {1}, Root: {2}",
            //    page.Uri.AbsoluteUri,
            //    page.ParentUri.AbsoluteUri,
            //    page.IsRoot);

            // stamp the page with this crawl's identifiers so stored links can be correlated
            page.PageBag.SessionId = SessionId;
            page.PageBag.CrawlerId = CrawlerId;
            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var link = factory.ConvertToLinkToCrawl(page, SessionId);
                AddLinkToCrawl(link);
            }
        }
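
The summary above says Add assumes the links-to-avoid filtering has already run. A minimal caller sketch under that assumption; the _linksToAvoid set and the ScheduleIfAllowed method are hypothetical names for illustration, not from the source:

        // Hypothetical caller: the "pre-logic for links to avoid" that the summary
        // mentions happens here, before Add() is reached.
        // _linksToAvoid is an assumed HashSet<string> of absolute URLs to skip.
        public void ScheduleIfAllowed(PageToCrawl page)
        {
            if (_linksToAvoid.Contains(page.Uri.AbsoluteUri))
            {
                return; // filtered out before it is ever stored
            }

            Add(page); // stamps SessionId/CrawlerId and stores the converted link
        }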
Example No. 2
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            if (page.ParsedLinks == null || !page.ParsedLinks.Any())
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl  = new List<LinkToCrawl>();
                LinksToByPass = new List<CrawledLink>();
                return;
            }

            LinksToByPass     = new List<CrawledLink>();
            MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;
                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                // materialize the de-duplicated links, then release the temporary map
                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;
                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }
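
ProcessLink itself is not part of this example. A sketch of what it plausibly does, inferred from the map and bypass list populated above; the parameter types and the factory method names (CreateCrawledLink, CreateLinkToCrawl) are assumptions, since page.PageBag is dynamic and the project's IModelFactory API is not shown:

        // Hypothetical sketch of ProcessLink, consistent with how the fields above are used.
        private void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory,
                                 Uri targetUri, int sessionId, int crawlerId)
        {
            var key = targetUri.AbsoluteUri;
            if (MapOfLinksToCrawl.ContainsKey(key))
            {
                // target already queued in this batch, so record it as bypassed instead
                LinksToByPass.Add(factory.CreateCrawledLink(        // assumed factory method
                                      page.Uri.AbsoluteUri, key, sessionId, crawlerId));
            }
            else
            {
                MapOfLinksToCrawl.Add(key,
                                      factory.CreateLinkToCrawl(page, targetUri, sessionId)); // assumed factory method
            }
        }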
Example No. 3
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage        = e.CrawledPage;
            bool        externalLinksFound = false;

            // HttpWebResponse can be null when the request itself failed (e.g. a DNS
            // or connection error), so capture the status code defensively up front.
            var statusCode = crawledPage.HttpWebResponse == null
                                 ? default(HttpStatusCode)
                                 : crawledPage.HttpWebResponse.StatusCode;

            _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
                                crawledPage.Uri.AbsoluteUri,
                                statusCode,
                                crawledPage.ParentUri.AbsoluteUri,
                                crawledPage.PageBag.CrawlerId,
                                crawledPage.PageBag.SessionId);

            //----------------------------------------
            // create and store the crawled link
            var crawledLink = new CrawledLink();

            crawledLink.SessionId  = crawledPage.PageBag.SessionId;
            crawledLink.CrawlerId  = crawledPage.PageBag.CrawlerId;
            crawledLink.SourceUrl  = crawledPage.ParentUri.AbsoluteUri;
            crawledLink.TargetUrl  = crawledPage.Uri.AbsoluteUri; // what was crawled
            crawledLink.StatusCode = statusCode; // null-safe value captured above
            crawledLink.IsRoot     = crawledPage.IsRoot;
            crawledLink.CrawlDepth = crawledPage.CrawlDepth;

            //------------

            if (crawledPage.WebException != null)
            {
                // store error information if it occurred
                crawledLink.ErrorOccurred = true;
                crawledLink.Exception     = crawledPage.WebException.Message; //TODO store more data of the exception

                _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
                                            crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId),
                              crawledPage.WebException);
            }
            _scheduler.RecordCrawledLink(crawledLink);

            //----------------------------------------
            // Check if the page should be processed, if true process it
            //  - extract the title, keywords, description, cookies, etc from the page
            //    and save processed data.
            if (crawledPage.WebException == null)
            {
                if (IsPageToBeProcessed(crawledPage.Uri, statusCode))
                {
                    using (var processor = _provider.GetInstanceOf<ICrawledPageProcessor>())
                    {
                        var result = processor.ProcessPage(crawledPage);
                        _repo.AddProcessedPage(result);
                    }
                }

                externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
                if (externalLinksFound)
                {
                    OnExternalLinksFound(CrawlerId, crawledPage.Uri);
                }
            }

            string mssg = null;

            if (crawledPage.WebException != null || statusCode != HttpStatusCode.OK)
            {
                mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Error(mssg);
            }
            else
            {
                mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Debug(mssg);
            }

            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                _logger.Error(mssg);
            }

            //------------

            OnLinkCrawlCompleted(CrawlerDefinition,
                                 crawledPage.ParentUri.AbsoluteUri,
                                 crawledPage.Uri.AbsoluteUri,
                                 statusCode,
                                 crawledPage.WebException != null,
                                 externalLinksFound);
        }
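
IsPageToBeProcessed is not shown in the example. A minimal sketch of a plausible policy, assuming a simple status-code and file-extension filter; both checks are illustrative, not the project's confirmed rules:

        // Hypothetical sketch of IsPageToBeProcessed; the real policy is not shown above.
        private bool IsPageToBeProcessed(Uri uri, HttpStatusCode statusCode)
        {
            // only successful responses are worth extracting title/keywords/etc. from
            if (statusCode != HttpStatusCode.OK)
            {
                return false;
            }

            // skip obvious non-HTML resources by extension (illustrative heuristic)
            var path = uri.AbsolutePath.ToLowerInvariant();
            return !path.EndsWith(".jpg") && !path.EndsWith(".png") &&
                   !path.EndsWith(".css") && !path.EndsWith(".js");
        }

A handler with this signature is typically attached to Abot's PageCrawlCompleted event, e.g. crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;.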