Example #1
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    var page = new PageToCrawl(new Uri(link.TargetUrl));
    // carry the session and crawler ids along on the page's property bag
    page.PageBag.SessionId = link.SessionId;
    page.PageBag.CrawlerId = crawlerId;
    page.ParentUri = new Uri(link.SourceUrl);
    page.CrawlDepth = link.CrawlDepth;
    page.IsInternal = link.IsInternal;
    page.IsRoot = link.IsRoot;
    return page;
}
Example #2
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    // assumes ParentUri is set; for a root page it may be null and would need a guard
    link.SourceUrl = page.ParentUri.AbsoluteUri;
    link.TargetUrl = page.Uri.AbsoluteUri;
    link.TargetBaseDomain = page.Uri.GetBaseDomain();
    link.CrawlDepth = page.CrawlDepth;
    link.IsRoot = page.IsRoot;
    link.IsInternal = page.IsInternal;
    return link;
}
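
Examples #1 and #2 are mirror-image converters between the persistence model (LinkToCrawl) and the crawler's page model (PageToCrawl). A minimal round-trip sketch, assuming the converters live on a ModelFactory class (a name inferred from the IModelFactory in Example #7) and that LinkToCrawl has settable properties as shown above:

// hypothetical round trip between the two models
var factory = new ModelFactory();
var link = new LinkToCrawl
{
    SessionId = 1,
    SourceUrl = "http://example.com/",
    TargetUrl = "http://example.com/about",
    CrawlDepth = 1,
    IsInternal = true
};

var page = factory.ConvertToPageToCrawl(link, crawlerId: 7);
var copy = factory.ConvertToLinkToCrawl(page, link.SessionId);
// copy mirrors link: same target url, depth, and flags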
Example #3
public static LinkToCrawl GetLinkToCrawl(string srcUrl, string targetUrl)
{
    // test-data builder: returns a LinkToCrawl populated with fixed sample values
    var link = new LinkToCrawl();
    link.SessionId = 54;
    link.InProgress = true;
    link.SourceUrl = srcUrl;
    link.TargetUrl = targetUrl;
    link.TargetBaseDomain = "LL.Com";
    link.CrawlDepth = 3;
    link.IsRoot = true;
    link.IsInternal = true;

    return link;
}
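
Example #3 is a test-data builder with deliberately fixed values. A sketch of how it might exercise the converter from Example #1 (NUnit-style assertions; the test body is illustrative, not from the original suite):

[Test]
public void ConvertToPageToCrawl_CopiesDepthAndFlags()
{
    var link = GetLinkToCrawl("http://LL.Com/", "http://LL.Com/page");

    var page = new ModelFactory().ConvertToPageToCrawl(link, 1);

    Assert.AreEqual(3, page.CrawlDepth);  // the fixed depth from the builder
    Assert.IsTrue(page.IsRoot);
    Assert.IsTrue(page.IsInternal);
}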
Example #4
public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    // this was the link that was just crawled to produce the CrawledPage
    link.SourceUrl = page.Uri.AbsoluteUri;
    // this is the parsed link that must be scheduled
    link.TargetUrl = targetUri.AbsoluteUri;
    link.TargetBaseDomain = targetUri.GetBaseDomain();
    // creating a link from a crawled page, so it will not be the root
    link.IsRoot = false;
    // ordinal, case-insensitive compare avoids culture-specific surprises when matching domains
    link.IsInternal = string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase);
    // increasing depth is also done in the default scheduler
    link.CrawlDepth = page.CrawlDepth + 1;
    return link;
}
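
Example #4 is typically driven from a page-crawled handler: every link parsed out of the page becomes a candidate LinkToCrawl. A sketch of that loop, assuming the crawled page exposes its parsed links as Uris (e.g. Abot's CrawledPage.ParsedLinks) and reusing the de-duplicating add from Example #7:

// hypothetical handler body: schedule every link parsed from a crawled page
foreach (var targetUri in crawledPage.ParsedLinks)
{
    var link = factory.CreateLinkToCrawl(crawledPage, targetUri, sessionId);
    AddLinkToCrawlUnique(_repo, link);  // see Example #7
}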
Example #5
public void AddLinkToCrawl(LinkToCrawl link)
{
    // persist the link in its own short-lived session and transaction (NHibernate-style unit of work)
    using (var session = _sessionFactory.OpenSession())
    {
        using (var transaction = session.BeginTransaction())
        {
            session.Save(link);
            transaction.Commit();
        }
    }
}
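
Example #5 is the standard NHibernate unit-of-work pattern: one short-lived ISession and one transaction per write, built from an application-scoped session factory. A minimal sketch of how the repository might obtain _sessionFactory, assuming NHibernate with a hibernate.cfg.xml on disk (the constructor is not shown in the original):

using NHibernate;
using NHibernate.Cfg;

public class LinkRepository
{
    private readonly ISessionFactory _sessionFactory;

    public LinkRepository()
    {
        // building the session factory is expensive, so do it once per application
        _sessionFactory = new Configuration().Configure().BuildSessionFactory();
    }
}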
Example #6
public void AddLinkToCrawl(LinkToCrawl link)
{
    // add the link only if it is in neither the list of links to crawl nor the list of crawled links
    Thread.Sleep(100); // artificial delay, presumably simulating repository latency in this in-memory implementation

    var q = from l in LinksToCrawl.Values
            where l.SessionId == link.SessionId &&
                  string.Compare(l.SourceUrl, link.SourceUrl, true) == 0 &&
                  string.Compare(l.TargetUrl, link.TargetUrl, true) == 0
            select l;

    if (!q.Any())
    {
        var q2 = from l in CrawledLinks.Values
                 where l.SessionId == link.SessionId &&
                       string.Compare(l.SourceUrl, link.SourceUrl, true) == 0 &&
                       string.Compare(l.TargetUrl, link.TargetUrl, true) == 0
                 select l;

        if (!q2.Any())
        {
            link.Id = NextId;
            LinksToCrawl.Add(link.Id, link);
        }
    }
}
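
Example #6 is an in-memory stand-in for the repository, keyed by an id from a NextId member that the example never shows. A minimal thread-safe sketch of it, assuming ids only need to be unique within the process (the Dictionary adds themselves would still need external locking under concurrency):

private static int _lastId;

// process-unique id; Interlocked guards against races between crawler threads
private static int NextId
{
    get { return Interlocked.Increment(ref _lastId); }
}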
Example #7
/// <summary>
/// Adds link as a link to crawl only after checking current links to
/// crawl and links already crawled. If link has already been crawled or
/// is scheduled to be crawled, then it is added as a bypassed link.
/// </summary>
private void AddLinkToCrawlUnique(IRepository repo, LinkToCrawl link)
{
    // if the link has already been crawled or is scheduled to be crawled, bypass it
    if (repo.IsCrawled(link.SessionId, link.TargetUrl) || repo.IsToBeCrawled(link.SessionId, link.TargetUrl))
    {
        using (var factory = _provider.GetInstanceOf<IModelFactory>())
        {
            var crawled = factory.CreateCrawledLink(link.SourceUrl, link.TargetUrl, SessionId, CrawlerId);
            crawled.IsRoot = link.IsRoot;
            crawled.CrawlDepth = link.CrawlDepth;
            crawled.StatusCode = HttpStatusCode.OK;
            crawled.Bypassed = true;
            repo.AddCrawledLink(crawled, true);
        }
    }
    else
    {
        repo.AddLinkToCrawl(link);

        _logger.DebugFormat("AddLinkToCrawlUnique(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
            link.TargetUrl,
            link.SourceUrl,
            link.IsRoot);
    }
}
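
Example #7 leans on four repository members that the snippets never define in one place. A sketch of the implied surface, with signatures inferred from the call sites in these examples (the exact shapes are assumptions):

public interface IRepository
{
    // true if the target url was already crawled in the given session
    bool IsCrawled(int sessionId, string targetUrl);

    // true if the target url is already scheduled in the given session
    bool IsToBeCrawled(int sessionId, string targetUrl);

    // stores a crawled (possibly bypassed) link; the bool flag's meaning is not shown in these examples
    void AddCrawledLink(CrawledLink link, bool flag);

    // schedules a link, as implemented in Examples #5 and #6
    void AddLinkToCrawl(LinkToCrawl link);
}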
Example #8
/// <summary>
/// If this method is called, it assumes the pre-logic for links to avoid has already
/// been applied and that the <paramref name="link"/> should be stored for future crawling.
/// </summary>
public void AddLinkToCrawl(LinkToCrawl link)
{
    if (link == null)
        throw new ArgumentNullException("link");

    //_logger.DebugFormat("AddLinkToCrawl(): Target: {0}, Source: {1}, Root: {2}",
    //                    link.TargetUrl,
    //                    link.SourceUrl,
    //                    link.IsRoot);

    AddLinkToCrawlUnique(_repo, link);
}
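
Example #8 is the public entry point the earlier pieces feed: Example #4 builds the link, this method null-checks it, and Example #7 decides whether it is scheduled or bypassed. A sketch of that call path, with the scheduler class name and constructor wiring invented for illustration:

// hypothetical wiring; CrawlScheduler stands in for the class hosting Examples #7 and #8
var scheduler = new CrawlScheduler(repo, logger);
var link = factory.CreateLinkToCrawl(crawledPage, targetUri, sessionId);  // Example #4
scheduler.AddLinkToCrawl(link);  // null-checked here, de-duplicated in Example #7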