示例#1
0
        // Crawls a single page; when the crawler yields a result, the result is
        // published to listeners, persisted, and the page is removed from the
        // pending-pages repository. Pages with no result are left queued.
        private void Crawl(CrawlablePage page)
        {
            _crawler.Crawl(page).IfSome(crawlResult =>
            {
                _pageCrawledNotifier.Notify(crawlResult);
                _crawlResultSaver.Save(crawlResult);
                _crawlablePagesRepository.Delete(crawlResult.Url);
            });
        }
示例#2
0
        // Runs a full crawl synchronously and indexes every result: each file id
        // is mapped to the machine/location it was found on, then the word's
        // reverse-index entry is updated with the complete file set.
        private void ScanNowSync()
        {
            foreach (var result in _crawler.Crawl())
            {
                foreach (var id in result.Files)
                {
                    FileToMachineHandler.StoreIntoLookUp(id, result.FileLocation);
                }

                ReverseIndexHandler.StoreLookUp(result.Word, result.Files);
            }
        }
        /// <summary>
        ///     Pipeline step that parses a sitemap XML response and queues every
        ///     &lt;loc&gt; URL it contains for crawling.
        /// </summary>
        /// <param name="crawler">Crawler used to schedule the discovered URLs.</param>
        /// <param name="propertyBag">Per-request state holding the downloaded response.</param>
        /// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Nothing to parse unless we got a successful, non-empty response.
            if (propertyBag.StatusCode != HttpStatusCode.OK ||
                propertyBag.Response == null ||
                propertyBag.Response.Length == 0)
            {
                return Task.FromResult(true);
            }

            if (!IsXmlContent(propertyBag.ContentType))
            {
                return Task.FromResult(true);
            }

            using (MemoryStream ms = new MemoryStream(propertyBag.Response))
            {
                XDocument mydoc = XDocument.Load(ms);
                if (mydoc.Root == null)
                {
                    return Task.FromResult(true);
                }

                XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

                // BUGFIX: accept both http:// and https:// locations; the previous
                // check only matched "http://" and silently dropped every https
                // sitemap entry.
                IEnumerable<string> urlNodes =
                    from e in mydoc.Descendants(qualifiedName)
                    where !e.Value.IsNullOrEmpty() &&
                          (e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                           e.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    select e.Value;

                // Hoisted out of the loop: depends only on the response URI.
                string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

                foreach (string url in urlNodes)
                {
                    // add new crawler steps
                    string decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(url);
                    string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    propertyBag["PropertyBagKeyOriginalUrl"].Value         = url;
                    propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
                    crawler.Crawl(new Uri(normalizedLink), propertyBag);
                }
            }

            return Task.FromResult(true);
        }
示例#4
0
		/// <summary>
		///     Pipeline step that parses a sitemap XML response and queues every
		///     &lt;loc&gt; URL it contains for crawling.
		/// </summary>
		/// <param name="crawler">Crawler used to schedule the discovered URLs.</param>
		/// <param name="propertyBag">Per-request state holding the downloaded response.</param>
		/// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// Nothing to parse unless we got a successful, non-empty response.
			if (propertyBag.StatusCode != HttpStatusCode.OK
				|| propertyBag.Response == null
				|| propertyBag.Response.Length == 0)
			{
				return Task.FromResult(true);
			}

			if (!IsXmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				XDocument mydoc = XDocument.Load(ms);
				if (mydoc.Root == null)
				{
					return Task.FromResult(true);
				}

				XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

				// BUGFIX: accept both http:// and https:// locations; the previous
				// check only matched "http://" and silently dropped every https
				// sitemap entry.
				IEnumerable<string> urlNodes =
					from e in mydoc.Descendants(qualifiedName)
					where !e.Value.IsNullOrEmpty()
						&& (e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
							|| e.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
					select e.Value;

				// Hoisted out of the loop: depends only on the response URI.
				string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

				foreach (string url in urlNodes)
				{
					// add new crawler steps
					string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
					string normalizedLink = NormalizeLink(baseUrl, decodedLink);
					if (normalizedLink.IsNullOrEmpty())
					{
						continue;
					}

					propertyBag["PropertyBagKeyOriginalUrl"].Value = url;
					propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
					crawler.Crawl(new Uri(normalizedLink), propertyBag);
				}
			}

			return Task.FromResult(true);
		}
        /// <summary>
        ///     Pipeline step that treats a plain-text response as crawl content:
        ///     stores the text on the property bag and queues every absolute URL
        ///     found in it.
        /// </summary>
        /// <param name="crawler">Crawler used to schedule the discovered URLs.</param>
        /// <param name="propertyBag">Per-request state holding the downloaded response.</param>
        /// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // BUGFIX: also require a non-empty response; the sibling processors
            // guard against a null/empty Response, and Encoding.GetString throws
            // ArgumentNullException on a null buffer.
            if (propertyBag.StatusCode == HttpStatusCode.OK &&
                propertyBag.Response != null &&
                propertyBag.Response.Length > 0 &&
                IsTextContent(propertyBag.ContentType))
            {
                // NOTE(review): assumes the response body is UTF-8 — confirm upstream.
                string content = Encoding.UTF8.GetString(propertyBag.Response);
                propertyBag.Title = propertyBag.Step.Uri.ToString();
                propertyBag.Text  = content.Trim();
                MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
                foreach (Match urlMatch in urlMatches)
                {
                    Uri uri;
                    if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
                    {
                        crawler.Crawl(uri, propertyBag);
                    }
                }
            }

            return Task.FromResult(true);
        }
		/// <summary>
		///     Pipeline step that treats a plain-text response as crawl content:
		///     stores the text on the property bag and queues every absolute URL
		///     found in it.
		/// </summary>
		/// <param name="crawler">Crawler used to schedule the discovered URLs.</param>
		/// <param name="propertyBag">Per-request state holding the downloaded response.</param>
		/// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// BUGFIX: also require a non-empty response; the sibling processors
			// guard against a null/empty Response, and Encoding.GetString throws
			// ArgumentNullException on a null buffer.
			if (propertyBag.StatusCode == HttpStatusCode.OK
				&& propertyBag.Response != null
				&& propertyBag.Response.Length > 0
				&& IsTextContent(propertyBag.ContentType))
			{
				// NOTE(review): assumes the response body is UTF-8 — confirm upstream.
				string content = Encoding.UTF8.GetString(propertyBag.Response);
				propertyBag.Title = propertyBag.Step.Uri.ToString();
				propertyBag.Text = content.Trim();
				MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
				foreach (Match urlMatch in urlMatches)
				{
					Uri uri;
					if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
					{
						crawler.Crawl(uri, propertyBag);
					}
				}
			}

			return Task.FromResult(true);
		}
示例#7
0
        /// <summary>
        ///     Crawls a single queued blog and then removes it from the active list
        ///     (and from the queue itself when the crawl was not cancelled).
        /// </summary>
        /// <param name="queueListItem">Queue entry whose blog should be downloaded.</param>
        /// <param name="ct">Token signalling cancellation of the crawl.</param>
        /// <param name="pt">Token signalling pause/resume of the crawl.</param>
        private async Task StartSiteSpecificDownloader(QueueListItem queueListItem, CancellationToken ct, PauseToken pt)
        {
            IBlog blog = queueListItem.Blog;

            blog.Dirty = true;
            ProgressThrottler<DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);

            ICrawler crawler = CrawlerFactory.GetCrawler(blog.BlogType, ct, pt, progress, shellService, crawlerService, blog);
            await crawler.Crawl();

            // BUGFIX: use lock so the monitor is released even if a dispatcher call
            // throws; the previous bare Monitor.Enter/Monitor.Exit pair would leak
            // the lock on an exception. Also dedupes the RemoveActiveItem call that
            // both branches performed.
            lock (lockObject)
            {
                if (!ct.IsCancellationRequested)
                {
                    // Completed normally: the item is done, drop it from the queue.
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
                }

                // Whether cancelled or completed, the item is no longer active.
                QueueOnDispatcher.CheckBeginInvokeOnUI(() => crawlerService.RemoveActiveItem(queueListItem));
            }
        }
示例#8
0
        // Entry point: bootstraps configuration, logging and the crawler, runs a
        // timed crawl from the configured root URL, validates every discovered
        // page over HTTP, reports errors/stats, and persists the result as JSON.
        private static void Main()
        {
            InitConfig();
            _logger = new ConsoleLogger();
            InitCrawler();

            var stopwatch = Stopwatch.StartNew();

            _crawler.Crawl(0, _url);
            var pages = _crawler.GetPages();

            new Validator.Validator(new RestClient()).Validate(pages.Values);

            stopwatch.Stop();

            LogCrawlErrors(pages, _logger);

            _logger.Log($"Found: {pages.Count}");
            _logger.Log("Search time: " + stopwatch.Elapsed);

            SaveToJson(pages);
        }
        /// <summary>
        ///     Pipeline step that parses an HTML response: extracts title, meta data
        ///     and plain text into the property bag, resolves the document's base URL,
        ///     and queues every discovered link for crawling.
        /// </summary>
        /// <param name="crawler">Crawler used to schedule the discovered links.</param>
        /// <param name="propertyBag">Per-request state holding the downloaded response.</param>
        /// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define
            .NotNull(crawler, nameof(crawler))
            .NotNull(propertyBag, nameof(propertyBag));

            // Only successful HTML responses are processed; anything else passes through.
            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return(Task.FromResult(true));
            }

            if (!IsHtmlContent(propertyBag.ContentType))
            {
                return(Task.FromResult(true));
            }

            HtmlDocument htmlDoc = new HtmlDocument
            {
                OptionAddDebuggingAttributes = false,
                OptionAutoCloseOnEnd         = true,
                OptionFixNestedTags          = true,
                OptionReadEncoding           = true
            };

            using (MemoryStream ms = new MemoryStream(propertyBag.Response))
            {
                // Detect the document encoding first, then rewind and load with it;
                // fall back to the parser's own handling when none was detected.
                Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
                ms.Seek(0, SeekOrigin.Begin);
                if (!documentEncoding.IsNull())
                {
                    htmlDoc.Load(ms, documentEncoding, true);
                }
                else
                {
                    htmlDoc.Load(ms, true);
                }
            }

            // Keep the unmodified markup: the strip/substitute passes below each
            // start from the original content, not from each other's output.
            string originalContent = htmlDoc.DocumentNode.OuterHtml;

            if (HasTextStripRules || HasSubstitutionRules)
            {
                // Reload the document with text-strip/substitution rules applied so
                // the title/meta/text extraction below sees the filtered markup.
                string content = StripText(originalContent);
                content = Substitute(content, propertyBag.Step);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            propertyBag["HtmlDoc"].Value = htmlDoc;

            HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");

            // Extract Title (multiple <title> nodes are joined with ';')
            if (!nodes.IsNull())
            {
                propertyBag.Title = string.Join(";", nodes.
                                                Select(n => n.InnerText).
                                                ToArray()).Trim();
            }

            // Extract Meta Data ("name: content" pairs from <meta> tags)
            nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
            if (!nodes.IsNull())
            {
                propertyBag["Meta"].Value = (
                    from entry in nodes
                    let name = entry.Attributes["name"]
                               let content = entry.Attributes["content"]
                                             where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
                                             select $"{name.Value}: {content.Value}").ToArray();
            }

            // Extract text
            propertyBag.Text = htmlDoc.ExtractText().Trim();
            if (HasLinkStripRules || HasTextStripRules)
            {
                // Reload once more with link-strip rules applied so stripped
                // regions do not contribute links below.
                string content = StripLinks(originalContent);
                using (TextReader tr = new StringReader(content))
                {
                    htmlDoc.Load(tr);
                }
            }

            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

            // Extract Head Base: prefer an explicit <head><base href="..."> over the
            // response URI path; a relative href is resolved against scheme+server.
            nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
            if (!nodes.IsNull())
            {
                baseUrl = nodes
                          .Select(entry => new { entry, href = entry.Attributes["href"] })
                          .Where(arg => !arg.href.IsNull() &&
                                 !arg.href.Value.IsNullOrEmpty() &&
                                 Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
                          .Select(t =>
                {
                    if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
                    {
                        return(propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value);
                    }

                    return(t.href.Value);
                })
                          .AddToEnd(baseUrl)
                          .FirstOrDefault();
            }

            // Extract Links: queue every anchor and reference found in the document.
            DocumentWithLinks links = htmlDoc.GetLinks();

            foreach (string link in links.Links.Union(links.References))
            {
                if (link.IsNullOrEmpty())
                {
                    continue;
                }

                // Decode HTML entities, then normalize against the base URL;
                // links that normalize to nothing are skipped.
                string decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(link);
                string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                if (normalizedLink.IsNullOrEmpty())
                {
                    continue;
                }

                crawler.Crawl(new Uri(normalizedLink), propertyBag);
            }

            return(Task.FromResult(true));
        }
		/// <summary>
		///     Pipeline step that parses an HTML response: extracts title, meta data
		///     and plain text into the property bag, resolves the document's base URL,
		///     and queues every discovered link for crawling.
		/// </summary>
		/// <param name="crawler">Crawler used to schedule the discovered links.</param>
		/// <param name="propertyBag">Per-request state holding the downloaded response.</param>
		/// <returns>Always a completed task with <c>true</c> so the pipeline continues.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define
				.NotNull(crawler, nameof(crawler))
				.NotNull(propertyBag, nameof(propertyBag));

			// Only successful HTML responses are processed; anything else passes through.
			if (propertyBag.StatusCode != HttpStatusCode.OK)
			{
				return Task.FromResult(true);
			}

			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			HtmlDocument htmlDoc = new HtmlDocument
			{
				OptionAddDebuggingAttributes = false,
				OptionAutoCloseOnEnd = true,
				OptionFixNestedTags = true,
				OptionReadEncoding = true
			};

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				// Detect the document encoding first, then rewind and load with it;
				// fall back to the parser's own handling when none was detected.
				Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
				ms.Seek(0, SeekOrigin.Begin);
				if (!documentEncoding.IsNull())
				{
					htmlDoc.Load(ms, documentEncoding, true);
				}
				else
				{
					htmlDoc.Load(ms, true);
				}
			}

			// Keep the unmodified markup: the strip/substitute passes below each
			// start from the original content, not from each other's output.
			string originalContent = htmlDoc.DocumentNode.OuterHtml;
			if (HasTextStripRules || HasSubstitutionRules)
			{
				// Reload the document with text-strip/substitution rules applied so
				// the title/meta/text extraction below sees the filtered markup.
				string content = StripText(originalContent);
				content = Substitute(content, propertyBag.Step);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			propertyBag["HtmlDoc"].Value = htmlDoc;

			HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
			// Extract Title (multiple <title> nodes are joined with ';')
			if (!nodes.IsNull())
			{
				propertyBag.Title = string.Join(";", nodes.
					Select(n => n.InnerText).
					ToArray()).Trim();
			}

			// Extract Meta Data ("name: content" pairs from <meta> tags)
			nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
			if (!nodes.IsNull())
			{
				propertyBag["Meta"].Value = (
					from entry in nodes
					let name = entry.Attributes["name"]
					let content = entry.Attributes["content"]
					where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
					select $"{name.Value}: {content.Value}").ToArray();
			}

			// Extract text
			propertyBag.Text = htmlDoc.ExtractText().Trim();
			if (HasLinkStripRules || HasTextStripRules)
			{
				// Reload once more with link-strip rules applied so stripped
				// regions do not contribute links below.
				string content = StripLinks(originalContent);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

			// Extract Head Base: prefer an explicit <head><base href="..."> over the
			// response URI path; a relative href is resolved against scheme+server.
			nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
			if (!nodes.IsNull())
			{
				baseUrl = nodes
					.Select(entry => new {entry, href = entry.Attributes["href"]})
					.Where(arg => !arg.href.IsNull()
						&& !arg.href.Value.IsNullOrEmpty()
						&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
					.Select(t =>
					{
						if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
						{
							return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
						}

						return t.href.Value;
					})
					.AddToEnd(baseUrl)
					.FirstOrDefault();
			}

			// Extract Links: queue every anchor and reference found in the document.
			DocumentWithLinks links = htmlDoc.GetLinks();
			foreach (string link in links.Links.Union(links.References))
			{
				if (link.IsNullOrEmpty())
				{
					continue;
				}

				// Decode HTML entities, then normalize against the base URL;
				// links that normalize to nothing are skipped.
				string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
				string normalizedLink = NormalizeLink(baseUrl, decodedLink);
				if (normalizedLink.IsNullOrEmpty())
				{
					continue;
				}

				crawler.Crawl(new Uri(normalizedLink), propertyBag);
			}

			return Task.FromResult(true);
		}
示例#11
0
 // Kicks off a crawl on the supplied crawler instance.
 // NOTE(review): parameter name 'Crawler' breaks camelCase convention but is
 // kept — renaming would break any named-argument callers.
 static void CrawlSite(ICrawler Crawler) => Crawler.Crawl();