/// <summary>
/// Schedules <paramref name="link"/> for crawling only if it has not already been
/// crawled and is not already queued for this session. A duplicate is recorded as a
/// bypassed crawled link instead, so the repository keeps a trace of having seen it.
/// </summary>
/// <param name="repo">Repository used to check for duplicates and persist the result.</param>
/// <param name="link">The candidate link to schedule.</param>
private void AddLinkToCrawlUnique(IRepository repo, LinkToCrawl link)
{
    bool alreadySeen = repo.IsCrawled(link.SessionId, link.TargetUrl)
                       || repo.IsToBeCrawled(link.SessionId, link.TargetUrl);

    if (!alreadySeen)
    {
        // New link: queue it for crawling.
        repo.AddLinkToCrawl(link);
        _logger.DebugFormat("AddLinkToCrawlUnique(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
                            link.TargetUrl,
                            link.SourceUrl,
                            link.IsRoot);
        return;
    }

    // Duplicate: persist it as a bypassed crawled link rather than scheduling it again.
    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var crawled = factory.CreateCrawledLink(link.SourceUrl, link.TargetUrl, SessionId, CrawlerId);
        crawled.IsRoot = link.IsRoot;
        crawled.CrawlDepth = link.CrawlDepth;
        crawled.StatusCode = HttpStatusCode.OK;
        crawled.Bypassed = true;
        repo.AddCrawledLink(crawled, true);
    }
}
/// <summary>
/// Returns the next link queued for crawling in the given session and base domain
/// that is not already in progress, or null if none exists. When
/// <paramref name="markAsInProgress"/> is true, the returned link's row is flagged
/// InProgress = 1 so concurrent crawlers do not pick it up again.
/// </summary>
/// <param name="sessionId">Crawl session the link must belong to.</param>
/// <param name="baseDomain">Target base domain to restrict the search to.</param>
/// <param name="markAsInProgress">Whether to flag the returned link as in progress.</param>
/// <returns>The next <see cref="LinkToCrawl"/>, or null when the queue is empty.</returns>
public LinkToCrawl GetNextLinkToCrawl(int sessionId, string baseDomain, bool markAsInProgress)
{
    LinkToCrawl result = null;
    using (var session = _sessionFactory.OpenSession())
    {
        using (var transaction = session.BeginTransaction())
        {
            var q = session.Query<LinkToCrawl>()
                .Where(x => x.SessionId == sessionId)
                .Where(x => x.TargetBaseDomain == baseDomain)
                .Where(x => x.InProgress == false);
            result = q.FirstOrDefault();
            transaction.Commit();
        }
    }

    if (result != null && markAsInProgress)
    {
        // FIX: the Id was previously concatenated into the SQL text via string.Format,
        // which is an injection/quoting hazard; use a parameter instead. The command
        // is also now disposed (it was leaked before).
        const string query = "UPDATE [LinkToCrawl] SET [InProgress] = 1 WHERE [Id] = @id;";
        using (var connection = new SqlConnection(_connStr))
        using (var command = new SqlCommand(query, connection) { CommandType = CommandType.Text })
        {
            command.Parameters.AddWithValue("@id", result.Id);
            connection.Open();
            command.ExecuteNonQuery();
        }
    }
    return result;
}
/// <summary>
/// Adds <paramref name="link"/> to the in-memory queue of links to crawl, but only
/// if no link with the same session, source URL and target URL (compared
/// case-insensitively) already exists in either the queue or the crawled set.
/// </summary>
/// <param name="link">The link to enqueue; its Id is assigned from NextId on success.</param>
public void AddLinkToCrawl(LinkToCrawl link)
{
    // NOTE(review): presumably simulates repository latency for tests — confirm before removing.
    Thread.Sleep(100);

    // FIX: string.Compare(a, b, true) compares using the current culture, which can
    // misclassify URLs (e.g. Turkish 'I' casing). URLs are non-linguistic identifiers,
    // so compare ordinally and case-insensitively instead.
    bool alreadyQueued = LinksToCrawl.Values.Any(l =>
        l.SessionId == link.SessionId
        && string.Equals(l.SourceUrl, link.SourceUrl, StringComparison.OrdinalIgnoreCase)
        && string.Equals(l.TargetUrl, link.TargetUrl, StringComparison.OrdinalIgnoreCase));
    if (alreadyQueued)
    {
        return;
    }

    bool alreadyCrawled = CrawledLinks.Values.Any(l =>
        l.SessionId == link.SessionId
        && string.Equals(l.SourceUrl, link.SourceUrl, StringComparison.OrdinalIgnoreCase)
        && string.Equals(l.TargetUrl, link.TargetUrl, StringComparison.OrdinalIgnoreCase));
    if (alreadyCrawled)
    {
        return;
    }

    link.Id = NextId;
    LinksToCrawl.Add(link.Id, link);
}
/// <summary>
/// Persists <paramref name="link"/> as a new row inside its own short-lived
/// session and transaction.
/// </summary>
/// <param name="link">The link to save.</param>
public void AddLinkToCrawl(LinkToCrawl link)
{
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        session.Save(link);
        transaction.Commit();
    }
}
/// <summary>
/// Maps a persisted <see cref="LinkToCrawl"/> back into an Abot
/// <see cref="PageToCrawl"/> so the crawler can process it, stashing the session
/// and crawler ids in the page's PageBag for later retrieval.
/// </summary>
/// <param name="link">The stored link to convert.</param>
/// <param name="crawlerId">Identifier of the crawler that will process the page.</param>
/// <returns>A new <see cref="PageToCrawl"/> targeting the link's TargetUrl.</returns>
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    var page = new PageToCrawl(new Uri(link.TargetUrl))
    {
        ParentUri = new Uri(link.SourceUrl),
        CrawlDepth = link.CrawlDepth,
        IsInternal = link.IsInternal,
        IsRoot = link.IsRoot,
    };
    page.PageBag.SessionId = link.SessionId;
    page.PageBag.CrawlerId = crawlerId;
    return page;
}
/// <summary>
/// Maps an Abot <see cref="PageToCrawl"/> into a persistable
/// <see cref="LinkToCrawl"/> for the given crawl session.
/// </summary>
/// <param name="page">The page whose parent/target URIs describe the link.</param>
/// <param name="sessionId">Crawl session the link belongs to.</param>
/// <returns>A new, unsaved <see cref="LinkToCrawl"/>.</returns>
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    return new LinkToCrawl
    {
        SessionId = sessionId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,
        TargetBaseDomain = page.Uri.GetBaseDomain(),
        CrawlDepth = page.CrawlDepth,
        IsRoot = page.IsRoot,
        IsInternal = page.IsInternal,
    };
}
/// <summary>
/// Queues <paramref name="link"/> for future crawling. Callers are assumed to have
/// already applied any links-to-avoid pre-filtering; duplicate detection is handled
/// by <see cref="AddLinkToCrawlUnique"/>.
/// </summary>
/// <param name="link">The link to queue; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="link"/> is null.</exception>
public void AddLinkToCrawl(LinkToCrawl link)
{
    if (link == null)
    {
        throw new ArgumentNullException("link");
    }

    AddLinkToCrawlUnique(_repo, link);
}
/// <summary>
/// Builds a canned <see cref="LinkToCrawl"/> with fixed test values
/// (session 54, depth 3, domain "LL.Com") for the given source/target URLs.
/// </summary>
/// <param name="srcUrl">Source URL to assign.</param>
/// <param name="targetUrl">Target URL to assign.</param>
/// <returns>A populated test link marked in-progress, root and internal.</returns>
public static LinkToCrawl GetLinkToCrawl(string srcUrl, string targetUrl)
{
    return new LinkToCrawl
    {
        SessionId = 54,
        InProgress = true,
        SourceUrl = srcUrl,
        TargetUrl = targetUrl,
        TargetBaseDomain = "LL.Com",
        CrawlDepth = 3,
        IsRoot = true,
        IsInternal = true,
    };
}
/// <summary>
/// Builds a <see cref="LinkToCrawl"/> for a link parsed out of an already-crawled
/// page: the crawled page's URL becomes the source, <paramref name="targetUri"/>
/// the target, and depth is incremented by one (mirroring the default scheduler).
/// </summary>
/// <param name="page">The page that was just crawled and yielded this link.</param>
/// <param name="targetUri">The parsed link that must be scheduled.</param>
/// <param name="sessionId">Crawl session the link belongs to.</param>
/// <returns>A new, unsaved <see cref="LinkToCrawl"/>; never a root link.</returns>
public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    // This was the link that was just crawled to produce the CrawledPage.
    link.SourceUrl = page.Uri.AbsoluteUri;
    // This is the link parsed that must be scheduled.
    link.TargetUrl = targetUri.AbsoluteUri;
    link.TargetBaseDomain = targetUri.GetBaseDomain();
    // Creating a link from a crawled page, so it will not be the root.
    link.IsRoot = false;
    // FIX: string.Compare(a, b, true) is a culture-sensitive comparison; domains are
    // non-linguistic identifiers, so compare ordinally and case-insensitively.
    link.IsInternal = string.Equals(page.Uri.GetBaseDomain(),
                                    targetUri.GetBaseDomain(),
                                    StringComparison.OrdinalIgnoreCase);
    // Increasing depth is also done in the default scheduler.
    link.CrawlDepth = page.CrawlDepth + 1;
    return link;
}
/// <summary>
/// Looks up the queued link matching the given session, source URL and target URL,
/// returning null when no such row exists.
/// </summary>
/// <param name="sessionId">Crawl session to search in.</param>
/// <param name="srcUrl">Exact source URL to match.</param>
/// <param name="targetUrl">Exact target URL to match.</param>
/// <returns>The matching <see cref="LinkToCrawl"/>, or null.</returns>
public LinkToCrawl GetLinkToCrawl(int sessionId, string srcUrl, string targetUrl)
{
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        var match = session.Query<LinkToCrawl>()
            .Where(x => x.SessionId == sessionId)
            .Where(x => x.SourceUrl == srcUrl)
            .Where(x => x.TargetUrl == targetUrl)
            .FirstOrDefault();
        transaction.Commit();
        return match;
    }
}
/// <summary>
/// Converts each page to a <see cref="LinkToCrawl"/> for the current session and
/// stores the whole batch via AddLinksToCrawl.
/// </summary>
/// <param name="pages">Pages to schedule; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="pages"/> is null.</exception>
public void Add(IEnumerable<PageToCrawl> pages)
{
    if (pages == null)
    {
        throw new ArgumentNullException("pages");
    }

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var links = pages
            .Select(page => factory.ConvertToLinkToCrawl(page, SessionId))
            .ToList();
        AddLinksToCrawl(links);
    }
}
/// <summary>
/// Processes every link parsed from <paramref name="page"/>, populating
/// LinksToCrawl (new links, deduplicated via MapOfLinksToCrawl keyed per link)
/// and LinksToByPass (links recorded but not scheduled). When the page yielded
/// no parsed links, both collections are reset to empty lists.
/// </summary>
/// <param name="page">The crawled page whose ParsedLinks are to be processed.</param>
public void ProcessLinks(Abot.Poco.CrawledPage page)
{
    // FIX: Count() == 0 fully enumerates an IEnumerable just to test emptiness; Any() stops
    // at the first element.
    if (page.ParsedLinks == null || !page.ParsedLinks.Any())
    {
        _logger.DebugFormat("CrawledPage contained 0 parsed links");
        LinksToCrawl = new List<LinkToCrawl>();
        LinksToByPass = new List<CrawledLink>();
        return;
    }

    LinksToByPass = new List<CrawledLink>();
    MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var sessionId = page.PageBag.SessionId;
        var crawlerId = page.PageBag.CrawlerId;

        // FIX: removed unused locals (link, bypassedLink) left over from an earlier revision;
        // ProcessLink does all per-link work.
        foreach (var targetUri in page.ParsedLinks)
        {
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
        }

        LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
        MapOfLinksToCrawl.Clear();
        MapOfLinksToCrawl = null;

        if (_logger.IsDebugEnabled)
        {
            _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
            _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
        }
    }
}