Ejemplo n.º 1
0
 /// <summary>
 /// Builds a new <see cref="CrawledLink"/> carrying the supplied session id,
 /// crawler id and URL pair. No other properties are assigned by this method.
 /// </summary>
 /// <param name="sourceUrl">Value stored in <c>SourceUrl</c>.</param>
 /// <param name="targetUrl">Value stored in <c>TargetUrl</c>.</param>
 /// <param name="sessionId">Value stored in <c>SessionId</c>.</param>
 /// <param name="crawlerId">Value stored in <c>CrawlerId</c>.</param>
 /// <returns>The newly created link.</returns>
 public virtual CrawledLink CreateCrawledLink(string sourceUrl, string targetUrl, int sessionId, int crawlerId)
 {
     return new CrawledLink
     {
         SessionId = sessionId,
         CrawlerId = crawlerId,
         SourceUrl = sourceUrl,
         TargetUrl = targetUrl
     };
 }
Ejemplo n.º 2
0
 /// <summary>
 /// Builds a <see cref="CrawledLink"/> from a crawled page: ids come from the
 /// page bag, the source URL from the parent URI, the target URL from the page
 /// URI, plus the response status code, root flag and crawl depth.
 /// </summary>
 /// <param name="page">Crawled page whose values are copied onto the link.</param>
 /// <param name="sessionId">Not read by this method; the session id is taken from <c>page.PageBag</c>.</param>
 /// <param name="crawlerId">Not read by this method; the crawler id is taken from <c>page.PageBag</c>.</param>
 /// <returns>The newly created link.</returns>
 public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
 {
     // NOTE(review): the sessionId/crawlerId arguments are ignored in favour of
     // the page bag values - confirm this is intended.
     return new CrawledLink
     {
         SessionId = page.PageBag.SessionId,
         CrawlerId = page.PageBag.CrawlerId,
         SourceUrl = page.ParentUri.AbsoluteUri,
         TargetUrl = page.Uri.AbsoluteUri, // what was crawled
         StatusCode = page.HttpWebResponse.StatusCode,
         IsRoot = page.IsRoot,
         CrawlDepth = page.CrawlDepth
     };
 }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates a <see cref="CrawledLink"/> for the given URL pair with fixed,
        /// hard-coded values for every other property it assigns (ids, status
        /// code, error flag and text, root/bypassed flags, crawl depth).
        /// </summary>
        /// <param name="srcUrl">Value stored in <c>SourceUrl</c>.</param>
        /// <param name="targetUrl">Value stored in <c>TargetUrl</c>.</param>
        /// <returns>The populated link.</returns>
        public static CrawledLink GetCrawledLink(string srcUrl, string targetUrl)
        {
            return new CrawledLink
            {
                SessionId = 54,
                CrawlerId = 64,
                SourceUrl = srcUrl,
                TargetUrl = targetUrl,
                StatusCode = System.Net.HttpStatusCode.Conflict,
                ErrorOccurred = true,
                // Stored as the string form of a throwaway exception.
                Exception = new Exception("BLAH").ToString(),
                IsRoot = true,
                Bypassed = true,
                CrawlDepth = 3
            };
        }
Ejemplo n.º 4
0
 /// <summary>
 /// Saves <paramref name="link"/> through a session opened from the session
 /// factory, committing it in its own transaction.
 /// </summary>
 /// <param name="link">The crawled link to save.</param>
 /// <param name="removeCorrespondingLinkToCrawl">
 /// When true, <c>DeleteLinkToCrawl</c> is called with the link's session id,
 /// source URL and target URL before the save transaction is begun.
 /// </param>
 public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
 {
     // NOTE(review): fixed 100 ms pause before any work; its purpose is not
     // visible from here - confirm it is still required.
     Thread.Sleep(100);

     using (var dbSession = _sessionFactory.OpenSession())
     {
         if (removeCorrespondingLinkToCrawl)
         {
             DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
         }

         using (var tx = dbSession.BeginTransaction())
         {
             dbSession.Save(link);
             tx.Commit();
         }
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Assigns the next id to <paramref name="link"/> and adds it to
        /// <c>CrawledLinks</c> keyed by that id.
        /// </summary>
        /// <param name="link">The crawled link to store.</param>
        /// <param name="removeCorrespondingLinkToCrawl">
        /// When true, <c>DeleteLinkToCrawl</c> is called with the link's session
        /// id, source URL and target URL before the link is stored.
        /// </param>
        public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
        {
            // NOTE(review): fixed 100 ms pause before any work; its purpose is
            // not visible from here - confirm it is still required.
            Thread.Sleep(100);

            // Delete from LinksToCrawl first, then register the crawled link.
            if (removeCorrespondingLinkToCrawl)
            {
                DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
            }

            link.Id = NextId;
            CrawledLinks.Add(link.Id, link);
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Handles completion of a single page crawl. In order, it:
        /// logs the completion; builds a <see cref="CrawledLink"/> from the page
        /// (adding error details when a web exception is present) and records it
        /// through the scheduler; when no web exception is present, optionally
        /// processes the page and stores the result, then hands the page to the
        /// scheduler for parsed-link processing and raises the external-links
        /// notification if that call returns true; logs success or failure; logs
        /// an error when the page content text is null or empty; and finally
        /// raises the link-crawl-completed notification.
        /// </summary>
        /// <param name="sender">Event source; not used by this handler.</param>
        /// <param name="e">Event data carrying the crawled page.</param>
        private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage page = e.CrawledPage;

            // NOTE(review): HttpWebResponse is dereferenced here, before
            // WebException is inspected. If the response can be null for failed
            // requests this line throws - confirm against the crawler library.
            _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
                                page.Uri.AbsoluteUri,
                                page.HttpWebResponse.StatusCode,
                                page.ParentUri.AbsoluteUri,
                                page.PageBag.CrawlerId,
                                page.PageBag.SessionId);

            //----------------------------------------
            // Build the record of what was crawled; it is stored via the scheduler below.
            var link = new CrawledLink
            {
                SessionId = page.PageBag.SessionId,
                CrawlerId = page.PageBag.CrawlerId,
                SourceUrl = page.ParentUri.AbsoluteUri,
                TargetUrl = page.Uri.AbsoluteUri, // what was crawled
                StatusCode = page.HttpWebResponse.StatusCode,
                IsRoot = page.IsRoot,
                CrawlDepth = page.CrawlDepth
            };

            if (page.WebException != null)
            {
                // Attach the error information to the record before it is stored.
                link.ErrorOccurred = true;
                link.Exception = page.WebException.Message; //TODO store more data of the exception

                string errorText = string.Format(
                    "A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
                    link.TargetUrl, link.SourceUrl, link.CrawlerId, link.SessionId);
                _logger.Error(errorText, page.WebException);
            }

            _scheduler.RecordCrawledLink(link);

            //----------------------------------------
            // Only pages without a web exception are processed and have their
            // parsed links handed to the scheduler.
            bool foundExternalLinks = false;
            if (page.WebException == null)
            {
                if (IsPageToBeProcessed(page.Uri, page.HttpWebResponse.StatusCode))
                {
                    using (var pageProcessor = _provider.GetInstanceOf<ICrawledPageProcessor>())
                    {
                        _repo.AddProcessedPage(pageProcessor.ProcessPage(page));
                    }
                }

                foundExternalLinks = _scheduler.ProcessParsedLinks(page);
                if (foundExternalLinks)
                {
                    OnExternalLinksFound(CrawlerId, page.Uri);
                }
            }

            //----------------------------------------
            // Success means: no web exception AND an OK status; anything else is logged as a failure.
            if (page.WebException == null && page.HttpWebResponse.StatusCode == HttpStatusCode.OK)
            {
                _logger.Debug(string.Format("Crawl of page succeeded {0}; source: {1}", page.Uri.AbsoluteUri, page.ParentUri.AbsoluteUri));
            }
            else
            {
                _logger.Error(string.Format("Crawl of page failed {0}; source: {1}", page.Uri.AbsoluteUri, page.ParentUri.AbsoluteUri));
            }

            if (string.IsNullOrEmpty(page.Content.Text))
            {
                _logger.Error(string.Format("Page had no content {0}", page.Uri.AbsoluteUri));
            }

            //----------------------------------------
            // Notify listeners that this link has been fully handled.
            OnLinkCrawlCompleted(CrawlerDefinition,
                                 page.ParentUri.AbsoluteUri,
                                 page.Uri.AbsoluteUri,
                                 page.HttpWebResponse.StatusCode,
                                 page.WebException != null,
                                 foundExternalLinks);
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Logs the link's target URL, source URL and root flag at debug level,
        /// then hands the link to the repository's <c>AddCrawledLink</c> with
        /// <c>true</c> as the second argument.
        /// </summary>
        /// <param name="crawledLink">The crawled link to record.</param>
        /// <returns>The link's <c>Id</c>, read after the repository call.</returns>
        public Guid RecordCrawledLink(CrawledLink crawledLink)
        {
            _logger.DebugFormat(
                "RecordCrawledLink(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
                crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.IsRoot);

            // Store the link; `true` is passed as removeCorrespondingLinkToCrawl.
            _repo.AddCrawledLink(crawledLink, true);

            return crawledLink.Id;
        }