/// <summary>
/// Processes a crawled page and returns a ProcessedPage object
/// which can be stored.
/// </summary>
/// <remarks>
/// <c>page.HttpWebResponse</c> is dereferenced without a null check, so a page
/// that carries no response will raise an exception here.
/// </remarks>
/// <param name="page">The results of a crawled url</param>
/// <returns>
/// A new ProcessedPage carrying the session id, crawler id, absolute url and
/// HTTP status code taken from <paramref name="page"/>.
/// </returns>
public ProcessedPage ProcessPage(Abot.Poco.CrawledPage page)
{
    //TODO extract data
    var processed = new ProcessedPage();
    processed.SessionId = page.PageBag.SessionId;
    processed.CrawlerId = page.PageBag.CrawlerId;
    processed.PageUrl = page.Uri.AbsoluteUri;
    processed.StatusCode = page.HttpWebResponse.StatusCode;

    //TODO store cookies (available from page.HttpWebResponse.Cookies)

    return processed;
}
/// <summary>
/// Classifies every link parsed from <paramref name="page"/> and publishes the
/// outcome through <c>LinksToCrawl</c> and <c>LinksToByPass</c>.
/// </summary>
/// <remarks>
/// Both result lists are replaced on every call. When the page has no parsed
/// links they are set to empty lists. <c>MapOfLinksToCrawl</c> is only a working
/// set used while the links are processed; it is cleared and set to null
/// before the method returns normally.
/// </remarks>
/// <param name="page">The crawled page whose <c>ParsedLinks</c> are processed.</param>
public void ProcessLinks(Abot.Poco.CrawledPage page)
{
    // Any() stops at the first element instead of enumerating the whole sequence.
    if (page.ParsedLinks == null || !page.ParsedLinks.Any())
    {
        _logger.DebugFormat("CrawledPage contained 0 parsed links");
        LinksToCrawl = new List<LinkToCrawl>();
        LinksToByPass = new List<CrawledLink>();
        return;
    }

    LinksToByPass = new List<CrawledLink>();
    MapOfLinksToCrawl = new Dictionary<string, LinkToCrawl>();

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var sessionId = page.PageBag.SessionId;
        var crawlerId = page.PageBag.CrawlerId;

        foreach (var targetUri in page.ParsedLinks)
        {
            ProcessLink(page, factory, targetUri, sessionId, crawlerId);
        }

        // Publish the de-duplicated links, then drop the working map.
        LinksToCrawl = MapOfLinksToCrawl.Values.ToList();
        MapOfLinksToCrawl.Clear();
        MapOfLinksToCrawl = null;

        // Guarded so the joined strings are only built when debug logging is on.
        if (_logger.IsDebugEnabled)
        {
            _logger.DebugFormat("TargetUrls of new LinksToCrawl: {0}",
                                String.Join("; ", LinksToCrawl.Select(o => o.TargetUrl)));
            _logger.DebugFormat("TargetUrls of new LinksToByPass: {0}",
                                String.Join("; ", LinksToByPass.Select(o => o.TargetUrl)));
        }
    }
}
/// <summary>
/// Processes the Uri specified by <paramref name="targetUri"/> as a potential link to be crawled
/// or bypassed.
/// </summary>
/// <remarks>
/// A link is bypassed (added to <c>LinksToByPass</c>) when it uses the mailto scheme,
/// when its absolute url equals the url of <paramref name="page"/> (exact self loop),
/// or when its absolute url is already a key in <c>MapOfLinksToCrawl</c> (duplicate).
/// Any other link is added to <c>MapOfLinksToCrawl</c>, and <c>ExternalLinksFound</c>
/// is set when its base domain differs from the page's base domain.
/// </remarks>
/// <param name="page">The CrawledPage from which the targetUri was parsed.</param>
/// <param name="factory">An instance of IModelFactory</param>
/// <param name="targetUri">The target Uri being processed</param>
/// <param name="sessionId">Session id passed to the factory when creating link objects.</param>
/// <param name="crawlerId">Crawler id passed to the factory when creating bypassed links.</param>
internal void ProcessLink(Abot.Poco.CrawledPage page, IModelFactory factory, Uri targetUri, int sessionId, int crawlerId)
{
    string targetUrl = targetUri.AbsoluteUri;

    // Urls are identifiers, not linguistic text: compare ordinally so the self-loop
    // check agrees with the ordinal key lookup done by the dictionary.
    bool shouldBypass =
        targetUri.Scheme == Uri.UriSchemeMailto                                       // Mailto schema
        || string.Equals(page.Uri.AbsoluteUri, targetUrl, StringComparison.Ordinal)   // Exact self loops
        || MapOfLinksToCrawl.ContainsKey(targetUrl);                                  // Duplicates

    if (shouldBypass)
    {
        CrawledLink bypassedLink = factory.CreateCrawledLink(page.Uri, targetUri, sessionId, crawlerId);
        bypassedLink.IsRoot = false;
        bypassedLink.CrawlDepth = page.CrawlDepth + 1;
        bypassedLink.StatusCode = HttpStatusCode.OK;
        bypassedLink.Bypassed = true;
        LinksToByPass.Add(bypassedLink);
        return;
    }

    // process link to be crawled that was parsed from a crawled page, so
    // it will not be a root.
    var link = factory.CreateLinkToCrawl(page, targetUri, sessionId);
    MapOfLinksToCrawl.Add(targetUrl, link);

    // Case-insensitive ordinal comparison avoids culture-specific casing rules
    // (e.g. Turkish dotted/dotless i) when comparing host names.
    if (!string.Equals(page.Uri.GetBaseDomain(), targetUri.GetBaseDomain(), StringComparison.OrdinalIgnoreCase))
    {
        ExternalLinksFound = true;
    }
}