/// <summary>
/// Builds a <see cref="PageToCrawl"/> from a scheduled link, carrying over the
/// link's crawl metadata and stamping the session/crawler identifiers on the
/// page's bag so downstream processing can correlate the page with its crawl run.
/// </summary>
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    // Target URL becomes the page's own URI; source URL is the page we came from.
    var result = new PageToCrawl(new Uri(link.TargetUrl))
    {
        ParentUri = new Uri(link.SourceUrl),
        CrawlDepth = link.CrawlDepth,
        IsInternal = link.IsInternal,
        IsRoot = link.IsRoot,
    };
    // PageBag is populated after construction — it is created by PageToCrawl itself.
    result.PageBag.SessionId = link.SessionId;
    result.PageBag.CrawlerId = crawlerId;
    return result;
}
/// <summary>
/// Builds a <see cref="LinkToCrawl"/> from a page scheduled for crawling,
/// recording the parent page as the link's source and the page itself as the target.
/// </summary>
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    // NOTE(review): page.ParentUri.AbsoluteUri throws if ParentUri is null —
    // presumably this is never called for pages without a parent; confirm against callers.
    return new LinkToCrawl
    {
        SessionId = sessionId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,
        TargetBaseDomain = page.Uri.GetBaseDomain(),
        CrawlDepth = page.CrawlDepth,
        IsRoot = page.IsRoot,
        IsInternal = page.IsInternal,
    };
}
/// <summary>
/// Builds a <see cref="LinkToCrawl"/> fixture for the given source/target URLs
/// with fixed, arbitrary values for the remaining fields (session 54, depth 3,
/// base domain "LL.Com", in-progress internal root link).
/// </summary>
public static LinkToCrawl GetLinkToCrawl(string srcUrl, string targetUrl)
{
    return new LinkToCrawl
    {
        SessionId = 54,
        InProgress = true,
        SourceUrl = srcUrl,
        TargetUrl = targetUrl,
        TargetBaseDomain = "LL.Com",
        CrawlDepth = 3,
        IsRoot = true,
        IsInternal = true,
    };
}
/// <summary>
/// Builds a <see cref="LinkToCrawl"/> for a link parsed out of an already-crawled
/// page: the crawled page is the source, <paramref name="targetUri"/> is the target
/// to be scheduled, and the depth is one greater than the source page's depth.
/// </summary>
public virtual LinkToCrawl CreateLinkToCrawl(CrawledPage page, Uri targetUri, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    // this was the link that was just crawled to produce the CrawledPage
    link.SourceUrl = page.Uri.AbsoluteUri;
    // this is the link parsed that must be scheduled
    link.TargetUrl = targetUri.AbsoluteUri;
    link.TargetBaseDomain = targetUri.GetBaseDomain();
    // creating a link from a crawled page, so it will not be the root
    link.IsRoot = false;
    // Domain comparison is non-linguistic, so use an ordinal case-insensitive
    // comparison rather than the culture-sensitive string.Compare(..., true)
    // (which misbehaves under e.g. the Turkish "I" casing rules).
    link.IsInternal = string.Equals(
        page.Uri.GetBaseDomain(),
        targetUri.GetBaseDomain(),
        StringComparison.OrdinalIgnoreCase);
    // increasing depth is also done in the default scheduler
    link.CrawlDepth = page.CrawlDepth + 1;
    return link;
}
/// <summary>
/// Persists a link to crawl in its own short-lived session and transaction,
/// committing immediately after the save.
/// </summary>
public void AddLinkToCrawl(LinkToCrawl link)
{
    // Stacked usings: both session and transaction are disposed even if Save throws.
    using (var session = _sessionFactory.OpenSession())
    using (var tx = session.BeginTransaction())
    {
        session.Save(link);
        tx.Commit();
    }
}
/// <summary>
/// Adds the link to the pending set only if no link with the same session and
/// (case-insensitive) source/target URL pair already exists in either the
/// pending links or the crawled links; otherwise the link is silently dropped.
/// </summary>
public void AddLinkToCrawl(LinkToCrawl link)
{
    // NOTE(review): artificial delay kept from the original implementation —
    // presumably simulates I/O latency in this in-memory repository; confirm before removing.
    Thread.Sleep(100);

    // Single shared predicate replaces the two duplicated query expressions.
    // URLs are compared ordinally (case-insensitive): the original used the
    // culture-sensitive string.Compare(..., true), which is wrong for URLs.
    Func<LinkToCrawl, bool> isDuplicate = l =>
        l.SessionId == link.SessionId &&
        string.Equals(l.SourceUrl, link.SourceUrl, StringComparison.OrdinalIgnoreCase) &&
        string.Equals(l.TargetUrl, link.TargetUrl, StringComparison.OrdinalIgnoreCase);

    if (LinksToCrawl.Values.Any(isDuplicate) || CrawledLinks.Values.Any(isDuplicate))
        return;

    link.Id = NextId;
    LinksToCrawl.Add(link.Id, link);
}
/// <summary>
/// Adds link as a link to crawl only after checking current links to
/// crawl and links already crawled. If link has already been crawled or
/// is scheduled to be crawled, then it is added as a bypassed link.
/// </summary>
private void AddLinkToCrawlUnique(IRepository repo, LinkToCrawl link)
{
    bool alreadyKnown =
        repo.IsCrawled(link.SessionId, link.TargetUrl) ||
        repo.IsToBeCrawled(link.SessionId, link.TargetUrl);

    if (!alreadyKnown)
    {
        // Brand-new link: schedule it for crawling.
        repo.AddLinkToCrawl(link);
        _logger.DebugFormat("AddLinkToCrawlUnique(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
            link.TargetUrl,
            link.SourceUrl,
            link.IsRoot);
        return;
    }

    // Already crawled or already scheduled: record it as a bypassed crawled link instead.
    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var bypassed = factory.CreateCrawledLink(link.SourceUrl, link.TargetUrl, SessionId, CrawlerId);
        bypassed.IsRoot = link.IsRoot;
        bypassed.CrawlDepth = link.CrawlDepth;
        bypassed.StatusCode = HttpStatusCode.OK;
        bypassed.Bypassed = true;
        repo.AddCrawledLink(bypassed, true);
    }
}
/// <summary>
/// If this method is called, then it assumes some pre-logic for links to avoid has already
/// been applied and that the link should be stored for future crawling.
/// Delegates to <see cref="AddLinkToCrawlUnique"/> for duplicate/bypass handling.
/// </summary>
/// <param name="link">The link to schedule; must not be null.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="link"/> is null.</exception>
public void AddLinkToCrawl(LinkToCrawl link)
{
    if (link == null)
        throw new ArgumentNullException("link");

    AddLinkToCrawlUnique(_repo, link);
}