Exemple #1
0
        public virtual CrawledLink CreateCrawledLink(string sourceUrl, string targetUrl, int sessionId, int crawlerId)
        {
            // Build a CrawledLink describing a crawl of targetUrl reached from
            // sourceUrl, tagged with the owning session and crawler.
            return new CrawledLink
            {
                SessionId = sessionId,
                CrawlerId = crawlerId,
                SourceUrl = sourceUrl,
                TargetUrl = targetUrl,
            };
        }
Exemple #2
0
        public Guid RecordCrawledLink(CrawledLink crawledLink)
        {
            // Trace what is about to be recorded.
            _logger.DebugFormat("RecordCrawledLink(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
                                crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.IsRoot);

            // Persist the crawled link; the 'true' flag asks the repository to also
            // remove the corresponding pending LinkToCrawl entry.
            _repo.AddCrawledLink(crawledLink, true);

            return crawledLink.Id;
        }
Exemple #3
0
        public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
        {
            // NOTE(review): fixed 100 ms pause before touching the collections —
            // presumably simulating I/O latency in this in-memory store; confirm intent.
            Thread.Sleep(100);

            // Drop the matching pending entry from LinksToCrawl before recording the result.
            if (removeCorrespondingLinkToCrawl)
            {
                DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
            }

            // Assign a fresh id and index the crawled link by it.
            link.Id = NextId;
            CrawledLinks.Add(link.Id, link);
        }
Exemple #4
0
        public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
        {
            // Build a CrawledLink from a completed page crawl.
            // NOTE(review): the sessionId/crawlerId parameters are not used — the values
            // are read from page.PageBag instead; confirm whether that is intentional.
            return new CrawledLink
            {
                SessionId  = page.PageBag.SessionId,
                CrawlerId  = page.PageBag.CrawlerId,
                SourceUrl  = page.ParentUri.AbsoluteUri,
                TargetUrl  = page.Uri.AbsoluteUri, // the URL that was actually crawled
                StatusCode = page.HttpWebResponse.StatusCode,
                IsRoot     = page.IsRoot,
                CrawlDepth = page.CrawlDepth,
            };
        }
Exemple #5
0
 public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
 {
     // NOTE(review): fixed 100 ms pause before persisting — presumably simulating
     // latency; confirm intent.
     Thread.Sleep(100);

     using (var session = _sessionFactory.OpenSession())
     {
         // Remove the matching pending LinkToCrawl entry first, if requested.
         if (removeCorrespondingLinkToCrawl)
         {
             DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
         }

         // Persist the crawled link inside its own transaction.
         using (var transaction = session.BeginTransaction())
         {
             session.Save(link);
             transaction.Commit();
         }
     }
 }
        /// <summary>
        /// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
        /// bypassed, or ignored.
        /// </summary>
        /// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
        /// <param name="factory">An instance of IModelFactory used to create the link objects.</param>
        /// <param name="targetUri">The target Uri being processed.</param>
        /// <param name="sessionId">Id of the crawl session the link belongs to.</param>
        /// <param name="crawlerId">Id of the crawler that parsed the link.</param>
        internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
        {
            // A link is bypassed (recorded but never crawled) when it is a mailto: link,
            // an exact self-loop back to the page it was parsed from, or a duplicate of
            // a link already scheduled. The three cases previously had byte-identical,
            // copy-pasted bodies; they are folded into a single branch here.
            // Ordinal comparison is used for URL equality (non-linguistic data).
            bool bypass =
                targetUri.Scheme == Uri.UriSchemeMailto ||
                string.Equals(page.Uri.AbsoluteUri, targetUri.AbsoluteUri, StringComparison.Ordinal) ||
                MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri);

            if (bypass)
            {
                var bypassedLink        = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
                bypassedLink.IsRoot     = false;
                bypassedLink.CrawlDepth = page.CrawlDepth + 1;
                bypassedLink.StatusCode = HttpStatusCode.OK;
                bypassedLink.Bypassed   = true;
                LinksToByPass.Add(bypassedLink);
            }
            else
            {
                // A link to crawl parsed from an already-crawled page can never be a root.
                var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
                MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

                // Flag when the link leaves the page's base domain (case-insensitive).
                if (!string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase))
                {
                    ExternalLinksFound = true;
                }
            }
        }
Exemple #7
0
        public static CrawledLink GetCrawledLink(string srcUrl, string targetUrl)
        {
            // Build a fully-populated sample CrawledLink (fixed session/crawler ids,
            // a Conflict status, and a recorded exception) for the given URLs.
            return new CrawledLink
            {
                SessionId     = 54,
                CrawlerId     = 64,
                SourceUrl     = srcUrl,
                TargetUrl     = targetUrl,
                StatusCode    = System.Net.HttpStatusCode.Conflict,
                ErrorOccurred = true,
                Exception     = new Exception("BLAH").ToString(),
                IsRoot        = true,
                Bypassed      = true,
                CrawlDepth    = 3,
            };
        }
Exemple #8
0
        public CrawledLink GetCrawledLink(int sessionId, int crawlerId, string srcUrl, string targetUrl)
        {
            // Look up the single CrawledLink identified by
            // (session, crawler, source URL, target URL); null when no row matches.
            using (var session = _sessionFactory.OpenSession())
            using (var transaction = session.BeginTransaction())
            {
                var result = session.Query <CrawledLink>()
                             .FirstOrDefault(x => x.SessionId == sessionId &&
                                                  x.CrawlerId == crawlerId &&
                                                  x.SourceUrl == srcUrl &&
                                                  x.TargetUrl == targetUrl);
                transaction.Commit();
                return result;
            }
        }
        /// <summary>
        /// Partitions the links parsed from <paramref name="page"/> into LinksToCrawl
        /// and LinksToByPass by running each one through ProcessLink.
        /// </summary>
        /// <param name="page">The crawled page whose ParsedLinks are processed.</param>
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            // No parsed links: publish empty result lists and stop.
            if (page.ParsedLinks == null || !page.ParsedLinks.Any())
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl  = new List <LinkToCrawl>();
                LinksToByPass = new List <CrawledLink>();
                return;
            }

            LinksToByPass     = new List <CrawledLink>();
            MapOfLinksToCrawl = new Dictionary <string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf <IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;

                // Classify each parsed link; ProcessLink fills MapOfLinksToCrawl and
                // LinksToByPass. (The unused 'link'/'bypassedLink' locals were removed.)
                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                // Publish the de-duplicated links and release the working map.
                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;

                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }
Exemple #10
0
        // Handles the crawler's PageCrawlCompleted event: records the crawled link,
        // optionally processes the page content, schedules newly parsed links, logs
        // the outcome, and raises OnLinkCrawlCompleted. The side effects here are
        // order-dependent (record link -> process page -> schedule links -> notify).
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage        = e.CrawledPage;
            bool        externalLinksFound = false;

            _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
                                crawledPage.Uri.AbsoluteUri,
                                crawledPage.HttpWebResponse.StatusCode,
                                crawledPage.ParentUri.AbsoluteUri,
                                crawledPage.PageBag.CrawlerId,
                                crawledPage.PageBag.SessionId);

            //----------------------------------------
            // create and store the crawled link
            // NOTE(review): this duplicates IModelFactory.CreateCrawledLink(CrawledPage, ...)
            // seen elsewhere in the codebase — consider delegating to the factory.
            var crawledLink = new CrawledLink();

            crawledLink.SessionId  = crawledPage.PageBag.SessionId;
            crawledLink.CrawlerId  = crawledPage.PageBag.CrawlerId;
            crawledLink.SourceUrl  = crawledPage.ParentUri.AbsoluteUri;
            crawledLink.TargetUrl  = crawledPage.Uri.AbsoluteUri; // what was crawled
            crawledLink.StatusCode = crawledPage.HttpWebResponse.StatusCode;
            crawledLink.IsRoot     = crawledPage.IsRoot;
            crawledLink.CrawlDepth = crawledPage.CrawlDepth;

            //------------

            if (crawledPage.WebException != null)
            {
                // store error information if it occurred
                crawledLink.ErrorOccurred = true;
                crawledLink.Exception     = crawledPage.WebException.Message; //TODO store more data of the exception

                _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
                                            crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId),
                              crawledPage.WebException);
            }
            // The link is recorded whether or not a WebException occurred.
            _scheduler.RecordCrawledLink(crawledLink);

            //----------------------------------------
            // Check if the page should be processed, if true process it
            //  - extract the title, keywords, description, cookies, etc from the page
            //    and save processed data.
            // Only attempted when the crawl itself did not fail with a WebException.
            if (crawledPage.WebException == null)
            {
                if (IsPageToBeProcessed(crawledPage.Uri, crawledPage.HttpWebResponse.StatusCode))
                {
                    using (var processor = _provider.GetInstanceOf <ICrawledPageProcessor>())
                    {
                        var result = processor.ProcessPage(crawledPage);
                        _repo.AddProcessedPage(result);
                    }
                }

                // Schedule the links parsed from this page; the scheduler reports
                // whether any of them point outside the page's domain.
                externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
                if (externalLinksFound)
                {
                    OnExternalLinksFound(CrawlerId, crawledPage.Uri);
                }
            }

            string mssg = null;

            // A crawl counts as failed on a WebException or any non-OK status code.
            if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
            {
                mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Error(mssg);
            }
            else
            {
                mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
                _logger.Debug(mssg);
            }

            // NOTE(review): assumes crawledPage.Content is non-null here — confirm
            // against the Abot contract for failed crawls.
            if (string.IsNullOrEmpty(crawledPage.Content.Text))
            {
                mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
                _logger.Error(mssg);
            }

            //------------

            // Notify listeners of the completed link crawl.
            OnLinkCrawlCompleted(CrawlerDefinition,
                                 crawledPage.ParentUri.AbsoluteUri,
                                 crawledPage.Uri.AbsoluteUri,
                                 crawledPage.HttpWebResponse.StatusCode,
                                 crawledPage.WebException != null,
                                 externalLinksFound);
        }