/// <summary>
/// Crawls a single page and, when the crawler yields a result, notifies listeners,
/// saves the result and deletes the result's URL from the crawlable-pages repository.
/// </summary>
/// <param name="page">The page handed to the crawler.</param>
private void Crawl(CrawlablePage page)
{
    // The callback is handed to IfSome, so it presumably runs only when the
    // crawl produced a value (option semantics assumed from the name - confirm).
    _crawler.Crawl(page).IfSome(crawled =>
    {
        _pageCrawledNotifier.Notify(crawled);
        _crawlResultSaver.Save(crawled);
        _crawlablePagesRepository.Delete(crawled.Url);
    });
}
/// <summary>
/// Runs the crawler and feeds every result into the two lookup handlers:
/// each file id is stored together with the result's file location, then the
/// result's word is stored together with its complete file list.
/// </summary>
private void ScanNowSync()
{
    foreach (var result in _crawler.Crawl())
    {
        // File id -> location entries for this result.
        foreach (var file in result.Files)
        {
            FileToMachineHandler.StoreIntoLookUp(file, result.FileLocation);
        }

        // Word -> files entry for this result.
        ReverseIndexHandler.StoreLookUp(result.Word, result.Files);
    }
}
/// <summary>
/// Pipeline step for XML sitemaps: parses the response body and passes every
/// absolute http/https URL found in a sitemap <c>loc</c> element to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each discovered URL.</param>
/// <param name="propertyBag">
/// State of the current download; status code, content type, response bytes and
/// response URI are read, and the original-URL / referrer entries are written.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    // Nothing to parse unless the download succeeded and produced a body.
    if (propertyBag.StatusCode != HttpStatusCode.OK
        || propertyBag.Response == null
        || propertyBag.Response.Length == 0)
    {
        return Task.FromResult(true);
    }

    if (!IsXmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        XDocument mydoc = XDocument.Load(ms);
        if (mydoc.Root == null)
        {
            return Task.FromResult(true);
        }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

        // Accept both http and https entries; sitemaps of TLS-only sites list
        // nothing but https URLs, which an http-only filter would drop entirely.
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            let loc = e.Value
            where !loc.IsNullOrEmpty()
                  && (loc.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                      || loc.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            select loc;

        foreach (string url in urlNodes)
        {
            // add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            // Record the raw sitemap entry and the sitemap's own URI before handing off.
            propertyBag["PropertyBagKeyOriginalUrl"].Value = url;
            propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
            crawler.Crawl(new Uri(normalizedLink), propertyBag);
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Pipeline step for XML sitemaps: parses the response body and passes every
/// absolute http/https URL found in a sitemap <c>loc</c> element to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each discovered URL.</param>
/// <param name="propertyBag">
/// State of the current download; status code, content type, response bytes and
/// response URI are read, and the original-URL / referrer entries are written.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    // Nothing to parse unless the download succeeded and produced a body.
    bool hasBody = propertyBag.StatusCode == HttpStatusCode.OK
                   && propertyBag.Response != null
                   && propertyBag.Response.Length != 0;
    if (!hasBody || !IsXmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        XDocument mydoc = XDocument.Load(ms);
        if (mydoc.Root == null)
        {
            return Task.FromResult(true);
        }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

        // Accept both http and https entries; sitemaps of TLS-only sites list
        // nothing but https URLs, which an http-only filter would drop entirely.
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            let loc = e.Value
            where !loc.IsNullOrEmpty()
                  && (loc.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                      || loc.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            select loc;

        foreach (string url in urlNodes)
        {
            // add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            // Record the raw sitemap entry and the sitemap's own URI before handing off.
            propertyBag["PropertyBagKeyOriginalUrl"].Value = url;
            propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
            crawler.Crawl(new Uri(normalizedLink), propertyBag);
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Pipeline step for plain-text responses: stores the decoded text on the property
/// bag and passes every absolute URL matched in that text to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each matched URL.</param>
/// <param name="propertyBag">
/// State of the current download; status code, content type, response bytes and
/// step URI are read, <c>Title</c> and <c>Text</c> are written.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK || !IsTextContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    // Encoding.GetString throws ArgumentNullException on a null buffer; a download
    // without a body simply has nothing to process.
    if (propertyBag.Response == null)
    {
        return Task.FromResult(true);
    }

    // The body is always decoded as UTF-8 here.
    string content = Encoding.UTF8.GetString(propertyBag.Response);

    // Plain text has no title of its own, so the step's URI is used instead.
    propertyBag.Title = propertyBag.Step.Uri.ToString();
    propertyBag.Text = content.Trim();

    MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
    foreach (Match urlMatch in urlMatches)
    {
        Uri uri;
        if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
        {
            crawler.Crawl(uri, propertyBag);
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Pipeline step for plain-text responses: stores the decoded text on the property
/// bag and passes every absolute URL matched in that text to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each matched URL.</param>
/// <param name="propertyBag">
/// State of the current download; status code, content type, response bytes and
/// step URI are read, <c>Title</c> and <c>Text</c> are written.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    bool isTextDownload = propertyBag.StatusCode == HttpStatusCode.OK
                          && IsTextContent(propertyBag.ContentType);

    // Encoding.GetString throws ArgumentNullException on a null buffer, so a
    // download without a body is skipped just like a non-text one.
    if (!isTextDownload || propertyBag.Response == null)
    {
        return Task.FromResult(true);
    }

    // The body is always decoded as UTF-8 here.
    string content = Encoding.UTF8.GetString(propertyBag.Response);

    // Plain text has no title of its own, so the step's URI is used instead.
    propertyBag.Title = propertyBag.Step.Uri.ToString();
    propertyBag.Text = content.Trim();

    MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
    foreach (Match urlMatch in urlMatches)
    {
        Uri uri;
        if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
        {
            crawler.Crawl(uri, propertyBag);
        }
    }

    return Task.FromResult(true);
}
/// <summary>
/// Runs the crawler that matches the queued blog's type and, once it has finished,
/// removes the item from the active list (and from the queue as well, unless
/// cancellation was requested).
/// </summary>
/// <param name="queueListItem">Queue entry whose blog is crawled.</param>
/// <param name="ct">Cancellation token passed to the crawler and checked after it returns.</param>
/// <param name="pt">Pause token passed to the crawler.</param>
/// <returns>A task that completes after the crawl has finished and the removals were dispatched.</returns>
private async Task StartSiteSpecificDownloader(QueueListItem queueListItem, CancellationToken ct, PauseToken pt)
{
    IBlog blog = queueListItem.Blog;
    blog.Dirty = true;

    ProgressThrottler<DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);
    ICrawler crawler = CrawlerFactory.GetCrawler(blog.BlogType, ct, pt, progress, shellService, crawlerService, blog);
    await crawler.Crawl();

    // Sample the token before taking the lock so the decision is made at the same
    // point as the crawl's completion.
    bool cancelled = ct.IsCancellationRequested;

    // `lock` wraps the region in try/finally, so the monitor is released even when
    // one of the dispatcher calls throws.
    lock (lockObject)
    {
        if (!cancelled)
        {
            // A cancelled item stays in the queue; only a completed one is removed.
            QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
        }

        QueueOnDispatcher.CheckBeginInvokeOnUI(() => crawlerService.RemoveActiveItem(queueListItem));
    }
}
/// <summary>
/// Program entry point: initializes configuration, logger and crawler, crawls from
/// the configured URL, validates the collected pages, logs the outcome and saves
/// the pages as JSON.
/// </summary>
private static void Main()
{
    InitConfig();
    _logger = new ConsoleLogger();
    InitCrawler();

    // The measured span covers both the crawl and the validation below.
    var stopwatch = Stopwatch.StartNew();

    _crawler.Crawl(0, _url);
    var crawledPages = _crawler.GetPages();

    var validator = new Validator.Validator(new RestClient());
    validator.Validate(crawledPages.Values);

    stopwatch.Stop();

    LogCrawlErrors(crawledPages, _logger);
    _logger.Log($"Found: {crawledPages.Count}");
    _logger.Log("Search time: " + stopwatch.Elapsed);

    SaveToJson(crawledPages);
}
/// <summary>
/// Pipeline step for HTML responses: parses the document, stores the parsed
/// document, title, meta data and extracted text on the property bag, and passes
/// every normalized link found in the page to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each link; checked with <c>AspectF.Define.NotNull</c>.</param>
/// <param name="propertyBag">
/// State of the current download; checked with <c>AspectF.Define.NotNull</c>. Status code,
/// content type, response bytes, step and response URI are read.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, nameof(crawler))
        .NotNull(propertyBag, nameof(propertyBag));

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return Task.FromResult(true);
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    // MemoryStream's constructor throws ArgumentNullException on a null buffer;
    // a download without a body has nothing to parse.
    if (propertyBag.Response == null)
    {
        return Task.FromResult(true);
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(ms);

        // Rewind before the real load, since the stream was just handed to DetectEncoding.
        ms.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(ms, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(ms, true);
        }
    }

    // Keep the untouched markup: link extraction further down restarts from it.
    string originalContent = htmlDoc.DocumentNode.OuterHtml;

    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty()
                  && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select $"{name.Value}: {content.Value}").ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();

    // Reload from the original markup (with links stripped) so that link discovery
    // is not affected by the text stripping / substitution applied above.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract Head Base
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        // NOTE(review): a relative href is appended directly after scheme+server, so a
        // value without a leading '/' would run into the host name - confirm intended.
        // AddToEnd presumably appends the response-derived base as the fallback - confirm.
        baseUrl = nodes
            .Select(entry => new { entry, href = entry.Attributes["href"] })
            .Where(arg => !arg.href.IsNull()
                          && !arg.href.Value.IsNullOrEmpty()
                          && Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
            .Select(t =>
            {
                if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
                {
                    return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
                }

                return t.href.Value;
            })
            .AddToEnd(baseUrl)
            .FirstOrDefault();
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        crawler.Crawl(new Uri(normalizedLink), propertyBag);
    }

    return Task.FromResult(true);
}
/// <summary>
/// Pipeline step for HTML responses: parses the document, stores the parsed
/// document, title, meta data and extracted text on the property bag, and passes
/// every normalized link found in the page to the crawler.
/// </summary>
/// <param name="crawler">Crawler whose <c>Crawl</c> method receives each link; checked with <c>AspectF.Define.NotNull</c>.</param>
/// <param name="propertyBag">
/// State of the current download; checked with <c>AspectF.Define.NotNull</c>. Status code,
/// content type, response bytes, step and response URI are read.
/// </param>
/// <returns>A completed task whose result is always <c>true</c>.</returns>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, nameof(crawler))
        .NotNull(propertyBag, nameof(propertyBag));

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return Task.FromResult(true);
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    // MemoryStream's constructor throws ArgumentNullException on a null buffer;
    // a download without a body has nothing to parse.
    if (propertyBag.Response == null)
    {
        return Task.FromResult(true);
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(ms);

        // Rewind before the real load, since the stream was just handed to DetectEncoding.
        ms.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(ms, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(ms, true);
        }
    }

    // Keep the untouched markup: link extraction further down restarts from it.
    string originalContent = htmlDoc.DocumentNode.OuterHtml;

    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract Title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes.Select(n => n.InnerText).ToArray()).Trim();
    }

    // Extract Meta Data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty()
                  && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select $"{name.Value}: {content.Value}").ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();

    // Reload from the original markup (with links stripped) so that link discovery
    // is not affected by the text stripping / substitution applied above.
    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract Head Base
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        // NOTE(review): a relative href is appended directly after scheme+server, so a
        // value without a leading '/' would run into the host name - confirm intended.
        // AddToEnd presumably appends the response-derived base as the fallback - confirm.
        baseUrl = nodes
            .Select(entry => new { entry, href = entry.Attributes["href"] })
            .Where(arg => !arg.href.IsNull()
                          && !arg.href.Value.IsNullOrEmpty()
                          && Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
            .Select(t =>
            {
                if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
                {
                    return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
                }

                return t.href.Value;
            })
            .AddToEnd(baseUrl)
            .FirstOrDefault();
    }

    // Extract Links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        crawler.Crawl(new Uri(normalizedLink), propertyBag);
    }

    return Task.FromResult(true);
}
/// <summary>
/// Runs the supplied crawler by invoking its parameterless <c>Crawl</c> method.
/// </summary>
/// <param name="Crawler">Crawler to run.</param>
static void CrawlSite(ICrawler Crawler) => Crawler.Crawl();