/// <summary>
/// Builds a <see cref="CrawledLink"/> from explicit source/target URLs and
/// the identifiers of the crawl session and crawler that produced it.
/// </summary>
/// <param name="sourceUrl">URL of the page the link was found on.</param>
/// <param name="targetUrl">URL the link points at.</param>
/// <param name="sessionId">Crawl session identifier.</param>
/// <param name="crawlerId">Crawler identifier.</param>
/// <returns>A new link with only the four supplied values populated.</returns>
public virtual CrawledLink CreateCrawledLink(string sourceUrl, string targetUrl, int sessionId, int crawlerId)
{
    return new CrawledLink
    {
        SessionId = sessionId,
        CrawlerId = crawlerId,
        SourceUrl = sourceUrl,
        TargetUrl = targetUrl
    };
}
/// <summary>
/// Builds a <see cref="CrawledLink"/> describing a completed crawl of
/// <paramref name="page"/> (status, root flag, depth, and both URLs).
/// </summary>
/// <param name="page">The page that was crawled.</param>
/// <param name="sessionId">Crawl session identifier to stamp on the link.</param>
/// <param name="crawlerId">Crawler identifier to stamp on the link.</param>
/// <returns>A populated link describing the crawl outcome.</returns>
/// <remarks>
/// FIX: the original accepted <paramref name="sessionId"/> and
/// <paramref name="crawlerId"/> but ignored them, reading the values from
/// <c>page.PageBag</c> instead; the explicit arguments are now honored so the
/// overload behaves as its signature promises.
/// </remarks>
public virtual CrawledLink CreateCrawledLink(CrawledPage page, int sessionId, int crawlerId)
{
    return new CrawledLink
    {
        SessionId = sessionId,
        CrawlerId = crawlerId,
        SourceUrl = page.ParentUri.AbsoluteUri,
        TargetUrl = page.Uri.AbsoluteUri,               // what was crawled
        StatusCode = page.HttpWebResponse.StatusCode,
        IsRoot = page.IsRoot,
        CrawlDepth = page.CrawlDepth
    };
}
/// <summary>
/// Test-fixture helper: returns a fully populated <see cref="CrawledLink"/>
/// with fixed sample values (session 54, crawler 64, HTTP 409 Conflict,
/// error flag set, bypassed root link at crawl depth 3).
/// </summary>
/// <param name="srcUrl">Source URL to stamp on the sample link.</param>
/// <param name="targetUrl">Target URL to stamp on the sample link.</param>
public static CrawledLink GetCrawledLink(string srcUrl, string targetUrl)
{
    var sample = new CrawledLink
    {
        SessionId = 54,
        CrawlerId = 64,
        SourceUrl = srcUrl,
        TargetUrl = targetUrl,
        StatusCode = System.Net.HttpStatusCode.Conflict,
        ErrorOccurred = true,
        Exception = new Exception("BLAH").ToString(),
        IsRoot = true,
        Bypassed = true,
        CrawlDepth = 3
    };
    return sample;
}
/// <summary>
/// Persists a crawled link to the database, optionally deleting the matching
/// pending "link to crawl" entry first so the two tables stay consistent.
/// </summary>
/// <param name="link">The link to save.</param>
/// <param name="removeCorrespondingLinkToCrawl">
/// When true, the pending entry identified by (SessionId, SourceUrl, TargetUrl)
/// is deleted before the save.
/// </param>
public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
{
    // FIX: removed a Thread.Sleep(100) left in the original — it blocked the
    // calling thread on every insert and looked like a debug/throttle artifact
    // with no functional purpose in a persistence method.
    using (var session = _sessionFactory.OpenSession())
    {
        // Remove the pending entry first, mirroring the in-memory repository.
        // NOTE(review): this runs outside the transaction below — presumably
        // DeleteLinkToCrawl manages its own session/transaction; confirm.
        if (removeCorrespondingLinkToCrawl)
            DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);

        using (var transaction = session.BeginTransaction())
        {
            session.Save(link);
            transaction.Commit();
        }
    }
}
/// <summary>
/// In-memory counterpart of the database repository: stores a crawled link in
/// the <c>CrawledLinks</c> dictionary, optionally deleting the matching
/// pending "link to crawl" entry first.
/// </summary>
/// <param name="link">The link to store; its <c>Id</c> is assigned here.</param>
/// <param name="removeCorrespondingLinkToCrawl">
/// When true, the pending entry identified by (SessionId, SourceUrl, TargetUrl)
/// is deleted before the add.
/// </param>
public void AddCrawledLink(CrawledLink link, bool removeCorrespondingLinkToCrawl)
{
    // FIX: removed a Thread.Sleep(100) left in the original — an artificial
    // delay with no functional purpose that slowed every insert.

    // delete from LinksToCrawl first
    if (removeCorrespondingLinkToCrawl)
        DeleteLinkToCrawl(link.SessionId, link.SourceUrl, link.TargetUrl);

    link.Id = NextId;
    CrawledLinks.Add(link.Id, link);
}
/// <summary>
/// Handler for the crawler's page-crawl-completed event: records the crawled
/// link with the scheduler, processes the page content when appropriate,
/// queues any newly parsed links, logs the outcome, and raises
/// <c>OnLinkCrawlCompleted</c> for listeners.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Carries the <c>CrawledPage</c> that was just fetched.</param>
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    bool externalLinksFound = false;
    _logger.DebugFormat("Page Crawl Completed {0}; Status {1}; Source URL: {2}; CrawlerId: {3}; SessionId: {4}", crawledPage.Uri.AbsoluteUri, crawledPage.HttpWebResponse.StatusCode, crawledPage.ParentUri.AbsoluteUri, crawledPage.PageBag.CrawlerId, crawledPage.PageBag.SessionId);
    //----------------------------------------
    // create and store the crawled link
    // NOTE(review): this duplicates the field mapping in CreateCrawledLink(CrawledPage, ...);
    // consider delegating to that factory if it is reachable from here — confirm.
    var crawledLink = new CrawledLink();
    crawledLink.SessionId = crawledPage.PageBag.SessionId;
    crawledLink.CrawlerId = crawledPage.PageBag.CrawlerId;
    crawledLink.SourceUrl = crawledPage.ParentUri.AbsoluteUri;
    crawledLink.TargetUrl = crawledPage.Uri.AbsoluteUri; // what was crawled
    crawledLink.StatusCode = crawledPage.HttpWebResponse.StatusCode;
    crawledLink.IsRoot = crawledPage.IsRoot;
    crawledLink.CrawlDepth = crawledPage.CrawlDepth;
    //------------
    if (crawledPage.WebException != null)
    {
        // store error information if it occurred
        crawledLink.ErrorOccurred = true;
        crawledLink.Exception = crawledPage.WebException.Message; //TODO store more data of the exception
        _logger.Error(string.Format("A WebException occurred for Target Url: {0}; Source URL: {1}; CrawlerId: {2}; SessionId: {3}", crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.CrawlerId, crawledLink.SessionId), crawledPage.WebException);
    }
    // The link is recorded unconditionally — success or failure — so the crawl
    // history is complete either way.
    _scheduler.RecordCrawledLink(crawledLink);
    //----------------------------------------
    // Check if the page should be processed, if true process it
    // - extract the title, keywords, description, cookies, etc from the page
    // and save processed data.
    // Only attempt content processing and link scheduling when the fetch
    // itself did not fail with a WebException.
    if (crawledPage.WebException == null)
    {
        if (IsPageToBeProcessed(crawledPage.Uri, crawledPage.HttpWebResponse.StatusCode))
        {
            // Processor is resolved per page and disposed immediately after use.
            using (var processor = _provider.GetInstanceOf<ICrawledPageProcessor>())
            {
                var result = processor.ProcessPage(crawledPage);
                _repo.AddProcessedPage(result);
            }
        }
        // Hand parsed links to the scheduler; it reports whether any pointed
        // outside the current site.
        externalLinksFound = _scheduler.ProcessParsedLinks(crawledPage);
        if (externalLinksFound)
        {
            OnExternalLinksFound(CrawlerId, crawledPage.Uri);
        }
    }
    // Log the overall outcome: anything other than a clean 200 is an error.
    string mssg = null;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        mssg = string.Format("Crawl of page failed {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Error(mssg);
    }
    else
    {
        mssg = string.Format("Crawl of page succeeded {0}; source: {1}", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri.AbsoluteUri);
        _logger.Debug(mssg);
    }
    // An empty body is logged as an error but does not change the flow.
    if (string.IsNullOrEmpty(crawledPage.Content.Text))
    {
        mssg = string.Format("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
        _logger.Error(mssg);
    }
    //------------
    // Notify listeners of the completed crawl, including whether it errored
    // and whether external links were discovered.
    OnLinkCrawlCompleted(CrawlerDefinition, crawledPage.ParentUri.AbsoluteUri, crawledPage.Uri.AbsoluteUri, crawledPage.HttpWebResponse.StatusCode, crawledPage.WebException != null, externalLinksFound);
}
/// <summary>
/// Records a completed crawl in the repository and returns the stored link's
/// identifier. The matching pending "link to crawl" entry is removed as part
/// of the same call.
/// </summary>
/// <param name="crawledLink">The link that was just crawled.</param>
/// <returns>The identifier assigned to the stored link.</returns>
public Guid RecordCrawledLink(CrawledLink crawledLink)
{
    _logger.DebugFormat("RecordCrawledLink(): TargetUrl: {0}, SourceUrl: {1}, Root: {2}", crawledLink.TargetUrl, crawledLink.SourceUrl, crawledLink.IsRoot);

    // Persist the link; 'true' also deletes the corresponding pending entry.
    _repo.AddCrawledLink(crawledLink, removeCorrespondingLinkToCrawl: true);

    var storedId = crawledLink.Id;
    return storedId;
}