/// <summary>
/// Builds a <see cref="CrawledLink"/> describing the edge sourceUrl -> targetUrl
/// for the given crawl session and crawler.
/// </summary>
/// <param name="sourceUrl">URL of the page the link was found on.</param>
/// <param name="targetUrl">URL the link points at.</param>
/// <param name="sessionId">Identifier of the crawl session.</param>
/// <param name="crawlerId">Identifier of the crawler instance.</param>
/// <returns>A new, partially populated <see cref="CrawledLink"/>.</returns>
public virtual CrawledLink CreateCrawledLink(string sourceUrl, string targetUrl, int sessionId, int crawlerId)
{
    return new CrawledLink
    {
        SessionId = sessionId,
        CrawlerId = crawlerId,
        SourceUrl = sourceUrl,
        TargetUrl = targetUrl
    };
}
/// <summary>
/// Persists a crawled link via the repository (removing its pending links-to-crawl
/// counterpart) and returns the identifier assigned to it.
/// </summary>
/// <param name="crawledLink">The link that has just been crawled.</param>
/// <returns>The id of the stored link.</returns>
public Guid RecordCrawledLink(CrawledLink crawledLink)
{
    _logger.DebugFormat(
        "RecordCrawledLink(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}",
        crawledLink.TargetUrl,
        crawledLink.SourceUrl,
        crawledLink.IsRoot);

    // Second argument asks the repository to delete the matching pending link-to-crawl.
    _repo.AddCrawledLink(crawledLink, true);

    return crawledLink.Id;
}
/// <summary>
/// Stores a crawled link in the in-memory map, optionally removing the matching
/// pending entry from the links-to-crawl store first.
/// </summary>
/// <param name="link">The crawled link to record; its Id is assigned here.</param>
/// <param name="removeCorrespondingLinkToCrawl">When true, the matching pending link is deleted first.</param>
public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
{
    // NOTE(review): presumably simulated latency for this in-memory store — confirm it is intentional.
    Thread.Sleep(100);

    // Delete the pending counterpart before recording the crawl, when requested.
    if (removeCorrespondingLinkToCrawl)
    {
        DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
    }

    link.Id = NextId;
    CrawledLinks.Add(link.Id, link);
}
/// <summary>
/// Builds a <see cref="CrawledLink"/> from a page that has just been crawled,
/// capturing the crawl edge (parent -> page), HTTP status, root flag and depth.
/// </summary>
/// <param name="page">The crawled page to convert.</param>
/// <param name="sessionId">Identifier of the crawl session.</param>
/// <param name="crawlerId">Identifier of the crawler instance.</param>
/// <returns>A new <see cref="CrawledLink"/> describing the crawl result.</returns>
public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
{
    var link = new CrawledLink();
    // FIX: the sessionId/crawlerId parameters were previously ignored in favor of
    // page.PageBag.SessionId/CrawlerId; callers in this codebase pass exactly those
    // PageBag values, so honoring the parameters is both correct and compatible.
    link.SessionId = sessionId;
    link.CrawlerId = crawlerId;
    link.SourceUrl = page.ParentUri.AbsoluteUri;
    link.TargetUrl = page.Uri.AbsoluteUri; // what was crawled
    link.StatusCode = page.HttpWebResponse.StatusCode;
    link.IsRoot = page.IsRoot;
    link.CrawlDepth = page.CrawlDepth;
    return link;
}
/// <summary>
/// Persists a crawled link to the database, optionally deleting the matching
/// pending links-to-crawl row first.
/// </summary>
/// <param name="link">The crawled link to save.</param>
/// <param name="removeCorrespondingLinkToCrawl">When true, the matching pending link is deleted before the save.</param>
public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
{
    // NOTE(review): hard-coded 100 ms delay on every save — presumably throttling,
    // but confirm it is intentional in production persistence code.
    Thread.Sleep(100);
    using (var session = _sessionFactory.OpenSession())
    {
        // NOTE(review): DeleteLinkToCrawl is not given this session, so it presumably
        // runs in its own session/transaction; the delete and the save below are
        // therefore NOT atomic — a failure between them leaves the link in neither store.
        if (removeCorrespondingLinkToCrawl)
        {
            DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);
        }
        // Save the crawled link within its own transaction.
        using (var transaction = session.BeginTransaction())
        {
            session.Save(link);
            transaction.Commit();
        }
    }
}
/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled,
/// bypassed, or ignored. A link is bypassed when it is a mailto: URL, an exact self-loop,
/// or a target already scheduled for crawling; otherwise it is queued for crawling.
/// </summary>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory</param>
/// <param name="targetUri">The target Uri being processed</param>
/// <param name="sessionId">Identifier of the crawl session.</param>
/// <param name="crawlerId">Identifier of the crawler instance.</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    // FIX: the three bypass branches were byte-identical duplicates; collapse them
    // into one condition. Ordinal comparison is used for URL equality (URLs are not
    // culture-sensitive text; the previous string.Compare used the current culture).
    bool isMailto = targetUri.Scheme == Uri.UriSchemeMailto;
    bool isSelfLoop = string.Equals(page.Uri.AbsoluteUri, targetUri.AbsoluteUri, StringComparison.Ordinal);
    bool isAlreadyScheduled = MapOfLinksToCrawl.ContainsKey(targetUri.AbsoluteUri);

    if (isMailto || isSelfLoop || isAlreadyScheduled)
    {
        // Record the bypassed link one level deeper than the page it was parsed from.
        var bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
    }
    else
    {
        // Process link to be crawled that was parsed from a crawled page, so
        // it will not be a root.
        var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
        MapOfLinksToCrawl.Add(targetUri.AbsoluteUri, link);

        // Flag when the target lives on a different base domain than the page.
        if (!string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase))
        {
            // was "ExternalLinksFound |= true" — a plain assignment to true is equivalent and clearer
            ExternalLinksFound = true;
        }
    }
}
/// <summary>
/// Builds a canned <see cref="CrawledLink"/> fixture: a bypassed, errored root link
/// at depth 3 for session 54 / crawler 64, with an HTTP 409 (Conflict) status.
/// </summary>
/// <param name="srcUrl">Source URL to stamp on the fixture.</param>
/// <param name="targetUrl">Target URL to stamp on the fixture.</param>
/// <returns>A fully populated test link.</returns>
public static CrawledLink GetCrawledLink(string srcUrl, string targetUrl)
{
    return new CrawledLink
    {
        SessionId = 54,
        CrawlerId = 64,
        SourceUrl = srcUrl,
        TargetUrl = targetUrl,
        StatusCode = System.Net.HttpStatusCode.Conflict,
        ErrorOccurred = true,
        Exception = new Exception("BLAH").ToString(),
        IsRoot = true,
        Bypassed = true,
        CrawlDepth = 3
    };
}
/// <summary>
/// Looks up a single crawled link matching the given session, crawler, source URL
/// and target URL, or null when no such row exists.
/// </summary>
/// <param name="sessionId">Crawl session identifier.</param>
/// <param name="crawlerId">Crawler identifier.</param>
/// <param name="srcUrl">Source URL of the link.</param>
/// <param name="targetUrl">Target URL of the link.</param>
/// <returns>The first matching <see cref="CrawledLink"/>, or null.</returns>
public CrawledLink GetCrawledLink(int sessionId, int crawlerId, string srcUrl, string targetUrl)
{
    using (var session = _sessionFactory.OpenSession())
    using (var transaction = session.BeginTransaction())
    {
        var match = session.Query<CrawledLink>()
            .FirstOrDefault(x => x.SessionId == sessionId
                              && x.CrawlerId == crawlerId
                              && x.SourceUrl == srcUrl
                              && x.TargetUrl == targetUrl);
        transaction.Commit();
        return match;
    }
}
/// <summary>
/// Partitions the links parsed from <paramref name="page"/> into
/// <c>LinksToCrawl</c> (de-duplicated, to be scheduled) and <c>LinksToByPass</c>
/// (mailto links, self-loops and duplicates).
/// </summary>
/// <param name="page">The crawled page whose ParsedLinks are to be processed.</param>
public void ProcessLinks(Abot.Poco.CrawledPage page)
{
    // No parsed links: reset both outputs to empty and bail out early.
    if (page.ParsedLinks == null || !page.ParsedLinks.Any()) // was Count() == 0 — Any() avoids a full enumeration
    {
        _logger.DebugFormat("CrawledPage contained 0 parsed links");
        LinksToCrawl = new List<LinkToCrawl>();
        LinksToByPass = new List<CrawledLink>();
        return;
    }

    LinksToByPass = new List<CrawledLink>();
    // Keyed by target AbsoluteUri so duplicates are detected in O(1).
    MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var sessionId = page.PageBag.SessionId;
        var crawlerId = page.PageBag.CrawlerId;

        // FIX: removed unused locals "link" and "bypassedLink" — ProcessLink does all the work.
        foreach (var targetUri in page.ParsedLinks)
        {
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
        }

        // Snapshot the de-duplicated links, then release the working map.
        LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
        MapOfLinksToCrawl.Clear();
        MapOfLinksToCrawl = null;

        if (_logger.IsDebugEnabled)
        {
            _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
            _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
        }
    }
}
/// <summary>
/// Handles completion of a single page crawl: records the crawled link, processes
/// the page content when appropriate, schedules its parsed links, and raises
/// completion/external-link events.
/// </summary>
/// <param name="sender">The crawler raising the event.</param>
/// <param name="e">Event data carrying the crawled page.</param>
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    bool externalLinksFound = false;

    // NOTE(review): HttpWebResponse is dereferenced here (and again below) even when
    // WebException != null — confirm Abot guarantees a non-null response in that case,
    // otherwise this logging line can throw a NullReferenceException.
    _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}",
        crawledPage.Uri.AbsoluteUri,
        crawledPage.HttpWebResponse.StatusCode,
        crawledPage.ParentUri.AbsoluteUri,
        crawledPage.PageBag.CrawlerId,
        crawledPage.PageBag.SessionId);

    //----------------------------------------
    // create and store the crawled link
    // NOTE(review): this hand-builds the link; it duplicates the factory overload
    // CreateCrawledLink(CrawledPage, int, int) elsewhere in this file — consider reusing it.
    var crawledLink = new CrawledLink();
    crawledLink.SessionId = crawledPage.PageBag.SessionId;
    crawledLink.CrawlerId = crawledPage.PageBag.CrawlerId;
    crawledLink.SourceUrl = crawledPage.ParentUri.AbsoluteUri;
    crawledLink.TargetUrl = crawledPage.Uri.AbsoluteUri; // what was crawled
    crawledLink.StatusCode = crawledPage.HttpWebResponse.StatusCode;
    crawledLink.IsRoot = crawledPage.IsRoot;
    crawledLink.CrawlDepth = crawledPage.CrawlDepth;

    //------------
    if (crawledPage.WebException != null)
    {
        // store error information if it occurred
        crawledLink.ErrorOccurred = true;
        crawledLink.Exception = crawledPage.WebException.Message; //TODO store more data of the exception
        _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}",
            crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId),
            crawledPage.WebException);
    }

    // Persist the crawl result (success or failure) via the scheduler.
    _scheduler.RecordCrawledLink(crawledLink);

    //----------------------------------------
    // Check if the page should be processed, if true process it
    // - extract the title, keywords, description, cookies, etc from the page
    // and save processed data.
    if (crawledPage.WebException == null)
    {
        if (IsPageToBeProcessed(crawledPage.Uri, crawledPage.HttpWebResponse.StatusCode))
        {
            // Extract and persist the processed page content.
            using (var processor = _provider.GetInstanceOf<ICrawledPageProcessor>())
            {
                var result = processor.ProcessPage(crawledPage);
                _repo.AddProcessedPage(result);
            }
        }

        // Queue the page's parsed links; flag when any point off-domain.
        externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
        if (externalLinksFound)
        {
            OnExternalLinksFound(CrawlerId, crawledPage.Uri);
        }
    }

    // Log the overall outcome of this page crawl.
    string mssg = null;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Error(mssg);
    }
    else
    {
        mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Debug(mssg);
    }

    // NOTE(review): Content is dereferenced unconditionally — confirm it is non-null
    // even on failed crawls.
    if (string.IsNullOrEmpty(crawledPage.Content.Text))
    {
        mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        _logger.Error(mssg);
    }

    //------------
    // Notify listeners that this link's crawl has completed.
    OnLinkCrawlCompleted(CrawlerDefinition,
        crawledPage.ParentUri.AbsoluteUri,
        crawledPage.Uri.AbsoluteUri,
        crawledPage.HttpWebResponse.StatusCode,
        crawledPage.WebException != null,
        externalLinksFound);
}