示例#1
0
        /// <summary>
        /// Schedules <paramref name="link"/> for crawling unless the repository reports that its
        /// target URL has already been crawled, or is already queued, for the link's session.
        /// In that case a crawled-link record flagged as bypassed is handed to the repository instead.
        /// </summary>
        /// <param name="repo">Repository that is queried for existing links and receives the new record.</param>
        /// <param name="link">Candidate link to schedule.</param>
        private void AddLinkToCrawlUnique(IRepository repo, LinkToCrawl link)
        {
            // Short-circuits: the "to be crawled" lookup only runs when the link was not crawled yet.
            bool alreadyKnown = repo.IsCrawled(link.SessionId, link.TargetUrl)
                                || repo.IsToBeCrawled(link.SessionId, link.TargetUrl);

            if (!alreadyKnown)
            {
                // New target: queue it and trace what was scheduled.
                repo.AddLinkToCrawl(link);

                _logger.DebugFormat("AddLinkToCrawlUnique(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
                                    link.TargetUrl,
                                    link.SourceUrl,
                                    link.IsRoot);
                return;
            }

            // Known target: record the link as bypassed rather than scheduling it again.
            using (var modelFactory = _provider.GetInstanceOf<IModelFactory>())
            {
                // NOTE(review): the record is created with this instance's SessionId/CrawlerId,
                // not link.SessionId - confirm the two always agree.
                var bypassed = modelFactory.CreateCrawledLink(link.SourceUrl, link.TargetUrl, SessionId, CrawlerId);
                bypassed.IsRoot     = link.IsRoot;
                bypassed.CrawlDepth = link.CrawlDepth;
                bypassed.StatusCode = HttpStatusCode.OK;
                bypassed.Bypassed   = true;
                repo.AddCrawledLink(bypassed, true);
            }
        }
示例#2
0
        /// <summary>
        /// Fetches the first stored <see cref="LinkToCrawl"/> for the given session and base domain
        /// that is not flagged as in progress, optionally flagging it as in progress in the database.
        /// </summary>
        /// <param name="sessionId">Session whose links are considered.</param>
        /// <param name="baseDomain">Value the link's TargetBaseDomain must equal.</param>
        /// <param name="markAsInProgress">
        /// When true and a link was found, its row is updated with [InProgress] = 1.
        /// The returned object itself is not modified by this method.
        /// </param>
        /// <returns>The matching link, or null when no link matches.</returns>
        public LinkToCrawl GetNextLinkToCrawl(int sessionId, string baseDomain, bool markAsInProgress)
        {
            LinkToCrawl result = null;

            using (var session = _sessionFactory.OpenSession())
            {
                using (var transaction = session.BeginTransaction())
                {
                    var q = session.Query<LinkToCrawl>()
                            .Where(x => x.SessionId == sessionId)
                            .Where(x => x.TargetBaseDomain == baseDomain)
                            .Where(x => x.InProgress == false);
                    result = q.FirstOrDefault();
                    transaction.Commit();
                }
            }

            if (result != null && markAsInProgress)
            {
                // NOTE(review): the read above and this update run on separate connections, so two
                // callers could pick the same row before either marks it - confirm this is acceptable.
                // The id is passed as a parameter instead of being formatted into the SQL text.
                const string updateSql = "UPDATE [LinkToCrawl] SET [InProgress] = 1 WHERE [Id] = @id;";

                using (var connection = new SqlConnection(_connStr))
                using (var command = new SqlCommand(updateSql, connection))
                {
                    command.CommandType = CommandType.Text;
                    command.Parameters.AddWithValue("@id", result.Id);
                    connection.Open();
                    command.ExecuteNonQuery();
                }
            }

            return result;
        }
示例#3
0
        /// <summary>
        /// Adds <paramref name="link"/> to <c>LinksToCrawl</c> unless an entry with the same session,
        /// source URL and target URL (URLs compared ignoring case) already exists in
        /// <c>LinksToCrawl</c> or <c>CrawledLinks</c>.
        /// </summary>
        /// <param name="link">Link to store; its Id is assigned from <c>NextId</c> when it is added.</param>
        public void AddLinkToCrawl(LinkToCrawl link)
        {
            // NOTE(review): fixed 100 ms pause kept from the original; presumably simulates
            // storage latency - confirm it is still wanted.
            Thread.Sleep(100);

            // URLs are identifiers, not linguistic text, so compare them ordinally: a culture-sensitive
            // ignore-case comparison can treat equal URLs as different under some cultures.
            bool alreadyQueued = LinksToCrawl.Values.Any(
                l => l.SessionId == link.SessionId &&
                     string.Equals(l.SourceUrl, link.SourceUrl, StringComparison.OrdinalIgnoreCase) &&
                     string.Equals(l.TargetUrl, link.TargetUrl, StringComparison.OrdinalIgnoreCase));
            if (alreadyQueued)
            {
                return;
            }

            bool alreadyCrawled = CrawledLinks.Values.Any(
                l => l.SessionId == link.SessionId &&
                     string.Equals(l.SourceUrl, link.SourceUrl, StringComparison.OrdinalIgnoreCase) &&
                     string.Equals(l.TargetUrl, link.TargetUrl, StringComparison.OrdinalIgnoreCase));
            if (alreadyCrawled)
            {
                return;
            }

            // Neither queued nor crawled: assign an id and store the link under it.
            link.Id = NextId;
            LinksToCrawl.Add(link.Id, link);
        }
示例#4
0
 /// <summary>
 /// Saves <paramref name="link"/> through a freshly opened session, inside its own transaction.
 /// </summary>
 /// <param name="link">Link to save.</param>
 public void AddLinkToCrawl(LinkToCrawl link)
 {
     using (var dbSession = _sessionFactory.OpenSession())
     using (var tx = dbSession.BeginTransaction())
     {
         dbSession.Save(link);
         tx.Commit();
     }
 }
示例#5
0
        /// <summary>
        /// Builds a <see cref="PageToCrawl"/> from a stored link: the target URL becomes the page URI,
        /// the source URL becomes the parent URI, and depth/internal/root flags are copied across.
        /// The session id and <paramref name="crawlerId"/> are placed in the page bag.
        /// </summary>
        /// <param name="link">Link whose values are copied.</param>
        /// <param name="crawlerId">Crawler id stored in the page bag.</param>
        /// <returns>The newly created page.</returns>
        public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
        {
            var converted = new PageToCrawl(new Uri(link.TargetUrl))
            {
                ParentUri  = new Uri(link.SourceUrl),
                CrawlDepth = link.CrawlDepth,
                IsInternal = link.IsInternal,
                IsRoot     = link.IsRoot
            };

            // Page-bag values are assigned separately from the initializer above.
            converted.PageBag.SessionId = link.SessionId;
            converted.PageBag.CrawlerId = crawlerId;

            return converted;
        }
示例#6
0
        /// <summary>
        /// Builds a <see cref="LinkToCrawl"/> from a page: the parent URI becomes the source URL,
        /// the page URI becomes the target URL (and supplies the target base domain), and
        /// depth/root/internal flags are copied across.
        /// </summary>
        /// <param name="page">Page whose values are copied.</param>
        /// <param name="sessionId">Session id stored on the link.</param>
        /// <returns>The newly created link.</returns>
        public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
        {
            return new LinkToCrawl
            {
                SessionId        = sessionId,
                SourceUrl        = page.ParentUri.AbsoluteUri,
                TargetUrl        = page.Uri.AbsoluteUri,
                TargetBaseDomain = page.Uri.GetBaseDomain(),
                CrawlDepth       = page.CrawlDepth,
                IsRoot           = page.IsRoot,
                IsInternal       = page.IsInternal
            };
        }
示例#7
0
        /// <summary>
        /// Hands <paramref name="link"/> to <c>AddLinkToCrawlUnique</c> together with this instance's
        /// repository. Callers are expected to have already applied any logic for links to avoid.
        /// </summary>
        /// <param name="link">Link to store for future crawling.</param>
        /// <exception cref="ArgumentNullException"><paramref name="link"/> is null.</exception>
        public void AddLinkToCrawl(LinkToCrawl link)
        {
            if (link != null)
            {
                AddLinkToCrawlUnique(_repo, link);
                return;
            }

            throw new ArgumentNullException("link");
        }
示例#8
0
        /// <summary>
        /// Creates a <see cref="LinkToCrawl"/> with the given URLs and otherwise fixed values
        /// (session 54, in progress, base domain "LL.Com", depth 3, root, internal).
        /// </summary>
        /// <param name="srcUrl">Value for the link's source URL.</param>
        /// <param name="targetUrl">Value for the link's target URL.</param>
        /// <returns>The populated link.</returns>
        public static LinkToCrawl GetLinkToCrawl(string srcUrl, string targetUrl)
        {
            return new LinkToCrawl
            {
                SessionId        = 54,
                InProgress       = true,
                SourceUrl        = srcUrl,
                TargetUrl        = targetUrl,
                TargetBaseDomain = "LL.Com",
                CrawlDepth       = 3,
                IsRoot           = true,
                IsInternal       = true
            };
        }
示例#9
0
        /// <summary>
        /// Creates the <see cref="LinkToCrawl"/> for a URI parsed out of a crawled page.
        /// </summary>
        /// <param name="page">Page that was just crawled; its URI becomes the link's source URL.</param>
        /// <param name="targetUri">Parsed URI that must be scheduled; becomes the link's target URL.</param>
        /// <param name="sessionId">Session id stored on the link.</param>
        /// <returns>
        /// A non-root link one level deeper than <paramref name="page"/>, flagged internal when the
        /// page and the target share the same base domain (ignoring case).
        /// </returns>
        public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
        {
            var link = new LinkToCrawl();

            link.SessionId = sessionId;
            // this was the link that was just crawled to produce the CrawledPage
            link.SourceUrl = page.Uri.AbsoluteUri;
            // this is the link parsed that must be scheduled
            var targetBaseDomain = targetUri.GetBaseDomain();
            link.TargetUrl        = targetUri.AbsoluteUri;
            link.TargetBaseDomain = targetBaseDomain;
            // creating a link from a crawled page, so it will not be the root
            link.IsRoot = false;
            // Domain names are identifiers, so compare ordinally; a culture-sensitive ignore-case
            // comparison can report equal domains as different under some cultures.
            link.IsInternal = string.Equals(page.Uri.GetBaseDomain(),
                                            targetBaseDomain,
                                            StringComparison.OrdinalIgnoreCase);
            // increasing depth is also done in the default scheduler
            link.CrawlDepth = page.CrawlDepth + 1;
            return link;
        }
示例#10
0
        /// <summary>
        /// Looks up the first stored <see cref="LinkToCrawl"/> whose session id, source URL and
        /// target URL equal the given values.
        /// </summary>
        /// <param name="sessionId">Session id to match.</param>
        /// <param name="srcUrl">Source URL to match.</param>
        /// <param name="targetUrl">Target URL to match.</param>
        /// <returns>The matching link, or null when none is found.</returns>
        public LinkToCrawl GetLinkToCrawl(int sessionId, string srcUrl, string targetUrl)
        {
            using (var dbSession = _sessionFactory.OpenSession())
            using (var tx = dbSession.BeginTransaction())
            {
                var match = dbSession.Query<LinkToCrawl>()
                                     .Where(x => x.SessionId == sessionId
                                                 && x.SourceUrl == srcUrl
                                                 && x.TargetUrl == targetUrl)
                                     .FirstOrDefault();
                tx.Commit();
                return match;
            }
        }
示例#11
0
        /// <summary>
        /// Converts every page in <paramref name="pages"/> to a <see cref="LinkToCrawl"/> for this
        /// instance's session and passes the resulting list to <c>AddLinksToCrawl</c>.
        /// </summary>
        /// <param name="pages">Pages to convert and schedule.</param>
        /// <exception cref="ArgumentNullException"><paramref name="pages"/> is null.</exception>
        public void Add(IEnumerable<PageToCrawl> pages)
        {
            if (pages == null)
            {
                throw new ArgumentNullException("pages");
            }

            using (var modelFactory = _provider.GetInstanceOf<IModelFactory>())
            {
                var converted = new List<LinkToCrawl>();

                foreach (var candidate in pages)
                {
                    converted.Add(modelFactory.ConvertToLinkToCrawl(candidate, SessionId));
                }

                // Submitted while the factory is still alive, as in the original flow.
                AddLinksToCrawl(converted);
            }
        }
示例#12
0
        /// <summary>
        /// Runs <c>ProcessLink</c> over every parsed link of <paramref name="page"/> and publishes the
        /// outcome through <c>LinksToCrawl</c> and <c>LinksToByPass</c>. When the page has no parsed
        /// links, both lists are set to empty lists.
        /// </summary>
        /// <param name="page">Crawled page whose parsed links are processed.</param>
        /// <exception cref="ArgumentNullException"><paramref name="page"/> is null.</exception>
        public void ProcessLinks(Abot.Poco.CrawledPage page)
        {
            if (page == null)
            {
                throw new ArgumentNullException("page");
            }

            // Any() stops at the first element instead of counting the whole sequence.
            if (page.ParsedLinks == null || !page.ParsedLinks.Any())
            {
                _logger.DebugFormat("CrawledPage contained 0 parsed links");
                LinksToCrawl  = new List<LinkToCrawl>();
                LinksToByPass = new List<CrawledLink>();
                return;
            }

            // Fresh working collections for this page.
            // presumably ProcessLink fills these two collections - verify against ProcessLink
            LinksToByPass     = new List<CrawledLink>();
            MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var sessionId = page.PageBag.SessionId;
                var crawlerId = page.PageBag.CrawlerId;

                foreach (var targetUri in page.ParsedLinks)
                {
                    ProcessLink(page, factory, targetUri, sessionId, crawlerId);
                }

                // Snapshot the map into the public list, then release the working map.
                LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
                MapOfLinksToCrawl.Clear();
                MapOfLinksToCrawl = null;

                // Guarded so the joined URL strings are only built when debug logging is on.
                if (_logger.IsDebugEnabled)
                {
                    _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                        String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
                    _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                        String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
                }
            }
        }