/// <summary>
		/// Extracts text from a successfully downloaded document: the response bytes are
		/// written to a temporary file carrying the extension mapped from the content type,
		/// and a text parser created by <c>ParserFactory</c> reads that file.
		/// </summary>
		/// <param name="crawler">The crawler running the pipeline; not used by this step.</param>
		/// <param name="propertyBag">Supplies the response and receives <c>Title</c> and <c>Text</c>.</param>
		/// <returns>Always <c>true</c>.</returns>
		public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			bool hasDownloadedBody = propertyBag.StatusCode == HttpStatusCode.OK
				&& propertyBag.Response != null;
			if (!hasDownloadedBody)
			{
				return true;
			}

			// Content types without a mapped extension are left untouched.
			string fileExtension = MapContentTypeToExtension(propertyBag.ContentType);
			if (fileExtension.IsNullOrEmpty())
			{
				return true;
			}

			propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
			using (TempFile scratchFile = new TempFile())
			{
				scratchFile.FileName = scratchFile.FileName + "." + fileExtension;

				// The write stream is closed before the parser opens the same file.
				using (FileStream output = new FileStream(scratchFile.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 4096))
				{
					await output.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
				}

				ITextParser textParser = ParserFactory.CreateText(new ParserContext(scratchFile.FileName));
				propertyBag.Text = textParser.Parse();
			}

			return true;
		}
		/// <summary>
		/// Downloads the current step's URI with an HTTP GET and copies the response
		/// metadata and body into the property bag.
		/// </summary>
		/// <param name="crawler">The crawler running the pipeline; not used by this step.</param>
		/// <param name="propertyBag">Supplies the step URI and receives the response data.</param>
		/// <returns>Always <c>true</c>.</returns>
		public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			FlurlClient client = propertyBag.Step.Uri.ToString()
				.ConfigureHttpClient(httpClient => { });

			// Record the Flurl call object and its duration once the request has completed.
			client.Settings.AfterCall += httpCall =>
			{
				propertyBag[FlurlHttpCallPropertyName].Value = httpCall;
				propertyBag.DownloadTime = httpCall.Duration.GetValueOrDefault();
			};

			HttpResponseMessage getResult = await client.GetAsync();

			// The typed Content-Type header is null when the server omits it, so both
			// values derived from it are read null-safely instead of throwing.
			propertyBag.CharacterSet = getResult.Content.Headers.ContentType?.CharSet;
			propertyBag.ContentEncoding = string.Join(";", getResult.Content.Headers.ContentEncoding);
			propertyBag.ContentType = getResult.Content.Headers.ContentType?.MediaType;
			propertyBag.Headers = getResult.Content.Headers.ToDictionary(x => x.Key, x => x.Value);

			// Taken from the response Date header, falling back to the current UTC time.
			propertyBag.LastModified = getResult.Headers.Date.GetValueOrDefault(DateTimeOffset.UtcNow).DateTime;
			propertyBag.Method = "GET";
			//propertyBag.ProtocolVersion = getResult.;
			//propertyBag.ResponseUri = getResult.Headers.Server;
			propertyBag.Server = string.Join(";", getResult.Headers.Server.Select(x => x.Product.ToString()));
			propertyBag.StatusCode = getResult.StatusCode;
			propertyBag.StatusDescription = getResult.StatusCode.ToString();
			propertyBag.Response = await getResult.Content.ReadAsByteArrayAsync();
			return true;
		}
Пример #3
0
 /// <summary>
 /// Creates an analytics runner bound to the given crawler.
 /// </summary>
 /// <param name="crawler">The crawler to use; must not be <c>null</c>.</param>
 /// <param name="modelAnalyzer">
 /// The model analyzer to use; when <c>null</c>, a new <c>ArticlesSiteAnalyzer</c> is created.
 /// </param>
 /// <exception cref="ArgumentNullException"><paramref name="crawler"/> is <c>null</c>.</exception>
 public AnalyticsRunner(ICrawler crawler, ModelAnalyzer modelAnalyzer = null)
 {
     if (crawler == null)
     {
         throw new ArgumentNullException("crawler");
     }

     this.modelAnalyzer = modelAnalyzer != null ? modelAnalyzer : new ArticlesSiteAnalyzer();
     this.crawler = crawler;
 }
		/// <summary>
		/// Delegates the decision to the configured predicate: the synchronous predicate
		/// takes precedence, then the task-returning one; with neither set the result is <c>true</c>.
		/// </summary>
		/// <param name="crawler">The crawler passed through to the predicate.</param>
		/// <param name="propertyBag">The property bag passed through to the predicate.</param>
		/// <returns>A task carrying the predicate's result, or <c>true</c> when no predicate is set.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			if (_predicate == null)
			{
				return _predicate2 == null
					? Task.FromResult(true)
					: _predicate2(crawler, propertyBag);
			}

			bool outcome = _predicate(crawler, propertyBag);
			return Task.FromResult(outcome);
		}
Пример #5
0
        /// <summary>
        /// Obtains a crawler for the blog, asks it to update the blog's meta information,
        /// and disposes the crawler afterwards.
        /// </summary>
        /// <param name="blog">The blog whose meta information is updated.</param>
        private async Task UpdateMetaInformationAsync(IBlog blog)
        {
            ICrawler metaCrawler = null;

            try
            {
                var progress = new Progress<DownloadProgress>();
                var pauseToken = new PauseToken();
                var cancellationToken = new CancellationToken();

                metaCrawler = _crawlerFactory.GetCrawler(blog, progress, pauseToken, cancellationToken);
                await metaCrawler.UpdateMetaInformationAsync();
            }
            finally
            {
                // The factory call itself may have thrown, leaving nothing to dispose.
                if (metaCrawler != null)
                {
                    metaCrawler.Dispose();
                }
            }
        }
Пример #6
0
        /// <summary>
        /// Waits for a slot on the semaphore, runs the online check for one blog, and
        /// releases the slot again whether or not the check succeeded.
        /// </summary>
        /// <param name="semaphoreSlim">The semaphore limiting concurrent checks.</param>
        /// <param name="blog">The blog to check.</param>
        private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
        {
            await semaphoreSlim.WaitAsync();

            try
            {
                var cancellationToken = new CancellationToken();
                var pauseToken = new PauseToken();
                var progress = new Progress<DownloadProgress>();

                ICrawler statusCrawler = crawlerFactory.GetCrawler(blog, cancellationToken, pauseToken, progress);
                await statusCrawler.IsBlogOnlineAsync();
            }
            finally
            {
                semaphoreSlim.Release();
            }
        }
Пример #7
0
        /// <summary>
        /// Returns the crawler registered under the configured application name,
        /// creating one for <paramref name="url"/> when none is registered yet.
        /// </summary>
        /// <param name="url">The URL handed to <c>Create</c> when a new crawler is needed.</param>
        /// <returns>The registered crawler, or the result of <c>Create</c>.</returns>
        public static ICrawler CreateMine(string url)
        {
            // The name comes from the "ApplicationName" app setting, defaulting to "UNKNOWN".
            string name = "{0}"._Format(DefaultConfiguration.GetAppSetting("ApplicationName", "UNKNOWN"));

            if (!Instances.ContainsKey(name))
            {
                return Create(name, url);
            }

            return Instances[name];
        }
Пример #8
0
        /// <summary>
        /// Reads a downloaded XML sitemap and hands every absolute HTTP or HTTPS URL found in a
        /// sitemap <c>loc</c> element to the crawler.
        /// </summary>
        /// <param name="crawler">The crawler that receives the discovered URLs.</param>
        /// <param name="propertyBag">Supplies the response; its original-URL properties are set per link.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Nothing to do without a successful, non-empty response.
            if (propertyBag.StatusCode != HttpStatusCode.OK ||
                propertyBag.Response == null ||
                propertyBag.Response.Length == 0)
            {
                return(Task.FromResult(true));
            }

            if (!IsXmlContent(propertyBag.ContentType))
            {
                return(Task.FromResult(true));
            }

            using (MemoryStream ms = new MemoryStream(propertyBag.Response))
            {
                XDocument mydoc = XDocument.Load(ms);
                if (mydoc.Root == null)
                {
                    return(Task.FromResult(true));
                }

                XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

                // Accept both http:// and https:// locations; filtering on "http://" alone
                // silently dropped every entry of an HTTPS sitemap.
                IEnumerable <string> urlNodes =
                    from e in mydoc.Descendants(qualifiedName)
                    where !e.Value.IsNullOrEmpty() &&
                          (e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase) ||
                           e.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                    select e.Value;

                foreach (string url in urlNodes)
                {
                    // add new crawler steps
                    string baseUrl        = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
                    string decodedLink    = ExtendedHtmlUtility.HtmlEntityDecode(url);
                    string normalizedLink = NormalizeLink(baseUrl, decodedLink);
                    if (normalizedLink.IsNullOrEmpty())
                    {
                        continue;
                    }

                    propertyBag["PropertyBagKeyOriginalUrl"].Value         = url;
                    propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
                    crawler.Crawl(new Uri(normalizedLink), propertyBag);
                }
            }

            return(Task.FromResult(true));
        }
Пример #9
0
        /// <summary>
        /// Scans the text left by the previous pipeline step for links and adds a new
        /// crawler step, one level deeper, for every link that normalizes to a non-empty URL.
        /// </summary>
        /// <param name="crawler">The crawler that receives the new steps.</param>
        /// <param name="propertyBag">Supplies the text, the response URI and the current step.</param>
        public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            var content = propertyBag.Text;
            if (this.HasTextStripRules)
            {
                content = this.StripText(content);
            }

            if (content.IsNullOrEmpty())
            {
                return;
            }

            if (this.HasLinkStripRules)
            {
                content = this.StripLinks(content);
            }

            foreach (Match candidate in s_LinkRegex.Value.Matches(content))
            {
                if (!candidate.Success)
                {
                    continue;
                }

                var rawLink = candidate.Value;
                if (rawLink.IsNullOrEmpty())
                {
                    continue;
                }

                // Links are normalized against the left path of the response URI.
                var normalized = rawLink.NormalizeUrl(propertyBag.ResponseUri.GetLeftPath());
                if (normalized.IsNullOrEmpty())
                {
                    continue;
                }

                // The new step remembers the raw link and the page it was found on.
                await crawler.AddStepAsync(
                    new Uri(normalized),
                    propertyBag.Step.Depth + 1,
                    propertyBag.Step,
                    new Dictionary <string, object>
                    {
                        { Resources.PropertyBagKeyOriginalUrl, new Uri(rawLink) },
                        { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
                    }).ConfigureAwait(false);
            }
        }
Пример #10
0
        /// <summary>
        /// Applies the site's robots rules to the current step: downloads and caches the robots
        /// file per host, waits for the crawl delay, and reports whether the step's URI is allowed.
        /// </summary>
        /// <param name="crawler">The crawler running the pipeline; not used by this step.</param>
        /// <param name="propertyBag">Supplies the step URI and user agent; receives the allow flag.</param>
        /// <returns>
        /// <c>true</c> when the robots file has no rules; otherwise the result of
        /// <c>IsPathAllowed</c> for the step's URI.
        /// </returns>
        public async Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Scheme and host are computed once; _searchPath overrides the default "/robots.txt".
            string schemeAndServer = propertyBag.Step.Uri
                                     .GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped)
                                     .ToLowerInvariant();
            string robotsHttpUrl = string.IsNullOrEmpty(_searchPath)
                                ? $"{schemeAndServer}/robots.txt"
                                : $"{schemeAndServer}" + _searchPath;

            RobotsTxt.Robots robots;
            if (!_robotsInfo.TryGetValue(robotsHttpUrl, out robots))
            {
                _logger.Verbose("Downloading robots.txt file from {@0}", robotsHttpUrl);
                string robotsContext = null;
                try
                {
                    robotsContext = await _httpClient.GetStringAsync(robotsHttpUrl);
                }
                catch (WebException)
                {
                    // Best effort: a failed download is treated as an empty robots file.
                }
                catch (ProtocolViolationException)
                {
                    // Best effort: see above.
                }
                catch (HttpRequestException)
                {
                    // Best effort: see above.
                }

                robots = new RobotsTxt.Robots(robotsContext ?? string.Empty);

                // Another call for the same host may have stored its copy while this one was
                // awaiting the download; assigning through the indexer overwrites it instead of
                // throwing ArgumentException as Add would on a duplicate key.
                _robotsInfo[robotsHttpUrl] = robots;
            }

            if (!robots.HasRules)
            {
                return(true);
            }

            long crawlDelay = robots.CrawlDelay(propertyBag.UserAgent);

            if (crawlDelay > 0)
            {
                // The value is passed to Task.Delay as a millisecond count.
                await Task.Delay((int)crawlDelay);
            }

            bool result = robots.IsPathAllowed(propertyBag.UserAgent, propertyBag.Step.Uri.ToString());

            propertyBag[RobotsIsPathAllowedPropertyName].Name  = nameof(RobotsPipelineStep);
            propertyBag[RobotsIsPathAllowedPropertyName].Value = result;
            return(result);
        }
Пример #11
0
		/// <summary>
		/// Reads a downloaded XML sitemap and hands every absolute HTTP or HTTPS URL found in a
		/// sitemap <c>loc</c> element to the crawler.
		/// </summary>
		/// <param name="crawler">The crawler that receives the discovered URLs.</param>
		/// <param name="propertyBag">Supplies the response; its original-URL properties are set per link.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// Nothing to do without a successful, non-empty response.
			if (propertyBag.StatusCode != HttpStatusCode.OK
				|| propertyBag.Response == null
				|| propertyBag.Response.Length == 0)
			{
				return Task.FromResult(true);
			}

			if (!IsXmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				XDocument mydoc = XDocument.Load(ms);
				if (mydoc.Root == null)
				{
					return Task.FromResult(true);
				}

				XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");

				// Accept both http:// and https:// locations; filtering on "http://" alone
				// silently dropped every entry of an HTTPS sitemap.
				IEnumerable<string> urlNodes =
					from e in mydoc.Descendants(qualifiedName)
					where !e.Value.IsNullOrEmpty()
						&& (e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
							|| e.Value.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
					select e.Value;

				foreach (string url in urlNodes)
				{
					// add new crawler steps
					string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
					string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
					string normalizedLink = NormalizeLink(baseUrl, decodedLink);
					if (normalizedLink.IsNullOrEmpty())
					{
						continue;
					}

					propertyBag["PropertyBagKeyOriginalUrl"].Value = url;
					propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
					crawler.Crawl(new Uri(normalizedLink), propertyBag);
				}
			}

			return Task.FromResult(true);
		}
Пример #12
0
 /// <summary>
 /// When the "check online status at startup" setting is enabled, runs the online
 /// check for every managed blog, one after another, inside <c>Task.Run</c>.
 /// </summary>
 private async Task CheckBlogsOnlineStatus()
 {
     if (!shellService.Settings.CheckOnlineStatusAtStartup)
     {
         return;
     }

     await Task.Run(async () =>
     {
         IEnumerable<IBlog> managedBlogs = managerService.BlogFiles;
         foreach (IBlog managedBlog in managedBlogs)
         {
             ICrawler statusCrawler = CrawlerFactory.GetCrawler(
                 managedBlog,
                 new CancellationToken(),
                 new PauseToken(),
                 new Progress<DownloadProgress>(),
                 shellService,
                 crawlerService,
                 managerService);

             await statusCrawler.IsBlogOnlineAsync();
         }
     });
 }
Пример #13
0
 /// <summary>
 /// Runs the online check for all currently selected blogs, with at most 25 checks
 /// in flight at the same time.
 /// </summary>
 private async Task CheckStatusAsync()
 {
     await Task.Run(async() =>
     {
         var semaphoreSlim         = new SemaphoreSlim(25);
         IEnumerable <IBlog> blogs = selectionService.SelectedBlogFiles.ToArray();
         IEnumerable <Task> tasks  = blogs.Select(async blog =>
         {
             await semaphoreSlim.WaitAsync();
             try
             {
                 ICrawler crawler = crawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                                                              new Progress <DownloadProgress>());
                 await crawler.IsBlogOnlineAsync();
             }
             finally
             {
                 // Release even when the check throws; otherwise each failure permanently
                 // consumes a slot and the remaining checks can wait forever.
                 semaphoreSlim.Release();
             }
         });
         await Task.WhenAll(tasks);
     });
 }
		/// <summary>
		/// Collects every match of the e-mail regular expression in the property bag's
		/// text and stores them, as a string array, under the "Email" property.
		/// </summary>
		/// <param name="crawler">The crawler.</param>
		/// <param name="propertyBag">The property bag; must not be <c>null</c>.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define
				.NotNull(propertyBag, "propertyBag");

			string content = propertyBag.Text;
			if (content.IsNullOrEmpty())
			{
				// No text: the "Email" property is left untouched.
				return Task.FromResult(true);
			}

			MatchCollection found = s_emailRegex.Value.Matches(content);
			propertyBag["Email"].Value =
				(from Match hit in found
				 select hit.Value).ToArray();

			return Task.FromResult(true);
		}
Пример #15
0
        /// <summary>
        /// Creates a blog for the given URL, checks that it is online, refreshes its meta
        /// information and, unless a blog with the same name and type already exists,
        /// saves it and adds it to the manager.
        /// </summary>
        /// <param name="blogUrl">
        /// The blog URL; when null or empty, <c>crawlerService.NewBlogUrl</c> is used instead.
        /// </param>
        private async Task AddBlogAsync(string blogUrl)
        {
            if (string.IsNullOrEmpty(blogUrl))
            {
                blogUrl = crawlerService.NewBlogUrl;
            }

            // TODO: Dependency, not SOLID!
            IBlog blog;

            try
            {
                blog = BlogFactory.GetBlog(blogUrl, Path.Combine(shellService.Settings.DownloadLocation, "Index"));
            }
            catch (ArgumentException)
            {
                // The factory rejected the URL; nothing is added.
                return;
            }

            blog = settingsService.TransferGlobalSettingsToBlog(blog);

            ICrawler crawler = CrawlerFactory.GetCrawler(
                blog.BlogType,
                new CancellationToken(),
                new PauseToken(),
                new Progress<DownloadProgress>(),
                shellService,
                crawlerService,
                blog);
            await crawler.IsBlogOnlineAsync();

            if (CheckIfTumblrPrivateBlog(blog))
            {
                // A promoted blog gets a fresh crawler built from its new blog type.
                blog    = PromoteTumblrBlogToPrivateBlog(blog);
                crawler = CrawlerFactory.GetCrawler(
                    blog.BlogType,
                    new CancellationToken(),
                    new PauseToken(),
                    new Progress<DownloadProgress>(),
                    shellService,
                    crawlerService,
                    blog);
            }

            await crawler.UpdateMetaInformationAsync();

            lock (lockObject)
            {
                bool alreadyManaged = managerService.BlogFiles.Any(
                    existing => existing.Name.Equals(blog.Name) && existing.BlogType.Equals(blog.BlogType));

                if (alreadyManaged)
                {
                    shellService.ShowError(null, Resources.BlogAlreadyExist, blog.Name);
                }
                else if (blog.Save())
                {
                    QueueOnDispatcher.CheckBeginInvokeOnUI((Action)(() => managerService.BlogFiles.Add(blog)));
                }
            }
        }
Пример #16
0
		/// <summary>
		/// Applies the site's robots rules to the current step: downloads and caches the robots
		/// file per host, waits for the crawl delay, and reports whether the step's URI is allowed.
		/// </summary>
		/// <param name="crawler">The crawler running the pipeline; not used by this step.</param>
		/// <param name="propertyBag">Supplies the step URI and user agent; receives the allow flag.</param>
		/// <returns>
		/// <c>true</c> when the robots file has no rules; otherwise the result of
		/// <c>IsPathAllowed</c> for the step's URI.
		/// </returns>
		public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// Scheme and host are computed once; _searchPath overrides the default "/robots.txt".
			string schemeAndServer = propertyBag.Step.Uri
				.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped)
				.ToLowerInvariant();
			string robotsHttpUrl = string.IsNullOrEmpty(_searchPath)
				? $"{schemeAndServer}/robots.txt"
				: $"{schemeAndServer}" + _searchPath;

			RobotsTxt.Robots robots;
			if (!_robotsInfo.TryGetValue(robotsHttpUrl, out robots))
			{
				_logger.Verbose("Downloading robots.txt file from {@0}", robotsHttpUrl);
				string robotsContext = null;
				try
				{
					robotsContext = await _httpClient.GetStringAsync(robotsHttpUrl);
				}
				catch (WebException)
				{
					// Best effort: a failed download is treated as an empty robots file.
				}
				catch (ProtocolViolationException)
				{
					// Best effort: see above.
				}
				catch (HttpRequestException)
				{
					// Best effort: see above.
				}

				robots = new RobotsTxt.Robots(robotsContext ?? string.Empty);

				// Another call for the same host may have stored its copy while this one was
				// awaiting the download; assigning through the indexer overwrites it instead of
				// throwing ArgumentException as Add would on a duplicate key.
				_robotsInfo[robotsHttpUrl] = robots;
			}

			if (!robots.HasRules)
			{
				return true;
			}

			long crawlDelay = robots.CrawlDelay(propertyBag.UserAgent);
			if (crawlDelay > 0)
			{
				// The value is passed to Task.Delay as a millisecond count.
				await Task.Delay((int) crawlDelay);
			}

			bool result = robots.IsPathAllowed(propertyBag.UserAgent, propertyBag.Step.Uri.ToString());
			propertyBag[RobotsIsPathAllowedPropertyName].Name = nameof(RobotsPipelineStep);
			propertyBag[RobotsIsPathAllowedPropertyName].Value = result;
			return result;
		}
        /// <summary>
        /// Collects every match of the e-mail regular expression in the property bag's
        /// text and stores them, as a string array, under the "Email" property.
        /// </summary>
        /// <param name="crawler">The crawler.</param>
        /// <param name="propertyBag">The property bag; must not be <c>null</c>.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define
            .NotNull(propertyBag, "propertyBag");

            string content = propertyBag.Text;
            if (content.IsNullOrEmpty())
            {
                // No text: the "Email" property is left untouched.
                return Task.FromResult(true);
            }

            MatchCollection found = s_emailRegex.Value.Matches(content);
            propertyBag["Email"].Value =
                (from Match hit in found
                 select hit.Value).ToArray();

            return Task.FromResult(true);
        }
Пример #18
0
        /// <summary>
        /// Creates one crawler per bike-share system, all sharing a single write service,
        /// runs them in parallel, then waits for a line on standard input.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var writeService = new BikeShareWriteService();

            ICrawler[] crawlers =
            {
                new MontrealBixiCrawler(writeService),
                new TorontoBixiCrawler(writeService),
                new CapitalBikeShareCrawler(writeService),
                new HubwayCrawler(writeService),
                new NiceRideMNCrawler(writeService),
                new VelovCrawler(writeService),
                new BarclaysCycleHireCrawler(writeService),
                new BicingCrawler(writeService),
            };

            Parallel.ForEach(crawlers, crawler => crawler.Run());

            // Blocks until a line has been read from the console.
            System.Console.ReadLine();
        }
Пример #19
0
        /// <summary>
        /// Crawls the blog of a queue item and afterwards removes the item from the active
        /// list and, unless cancellation was requested, from the queue.
        /// </summary>
        /// <param name="queueListItem">The queue item whose blog is crawled.</param>
        /// <param name="pt">Pause token handed to the crawler.</param>
        /// <param name="ct">Cancellation token handed to the crawler.</param>
        private async Task StartSiteSpecificDownloaderAsync(QueueListItem queueListItem, PauseToken pt, CancellationToken ct)
        {
            IBlog blog = queueListItem.Blog;

            blog.Dirty = true;
            ProgressThrottler <DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);

            ICrawler crawler = null;

            try
            {
                crawler = _crawlerFactory.GetCrawler(blog, progress, pt, ct);
                queueListItem.InterruptionRequested += crawler.InterruptionRequestedEventHandler;
                await crawler.CrawlAsync();

                blog.UpdateProgress(false);
            }
            catch (Exception e)
            {
                // Failures caused by a requested cancellation are not logged.
                if (!ct.IsCancellationRequested)
                {
                    Logger.Error("CrawlerController.StartSiteSpecificDownloaderAsync: {0}", e);
                }
            }
            finally
            {
                if (crawler != null)
                {
                    queueListItem.InterruptionRequested -= crawler.InterruptionRequestedEventHandler;
                }
                crawler?.Dispose();
            }

            // lock releases the monitor even if the dispatcher call throws; the previous
            // Monitor.Enter/Exit pair would have left the lock held in that case.
            lock (_lockObject)
            {
                QueueOnDispatcher.CheckBeginInvokeOnUI(() => _crawlerService.RemoveActiveItem(queueListItem));
            }

            if (!ct.IsCancellationRequested)
            {
                lock (_lockObject)
                {
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
                }
            }
        }
Пример #20
0
        /// <summary>
        /// Creates a blog for the given URL, promotes hidden Tumblr blogs, saves the blog and
        /// adds it to the manager unless one with the same name and type exists, then
        /// refreshes its meta information.
        /// </summary>
        /// <param name="blogUrl">
        /// The blog URL; when null or empty, <c>crawlerService.NewBlogUrl</c> is used instead.
        /// </param>
        private async Task AddBlogAsync(string blogUrl)
        {
            if (string.IsNullOrEmpty(blogUrl))
            {
                blogUrl = crawlerService.NewBlogUrl;
            }

            IBlog blog;

            try
            {
                blog = blogFactory.GetBlog(blogUrl, Path.Combine(shellService.Settings.DownloadLocation, "Index"));
            }
            catch (ArgumentException)
            {
                // The factory rejected the URL; nothing is added.
                return;
            }

            // The hidden-blog detection is only awaited for blogs that are exactly TumblrBlog.
            if (blog.GetType() == typeof(TumblrBlog))
            {
                if (await tumblrBlogDetector.IsHiddenTumblrBlog(blog.Url))
                {
                    blog = PromoteTumblrBlogToHiddenBlog(blog);
                }
            }

            lock (lockObject)
            {
                bool alreadyManaged = managerService.BlogFiles.Any(
                    existing => existing.Name.Equals(blog.Name) && existing.BlogType.Equals(blog.BlogType));

                if (alreadyManaged)
                {
                    shellService.ShowError(null, Resources.BlogAlreadyExist, blog.Name);
                    return;
                }

                if (blog.Save())
                {
                    AddToManager(blog);
                }
            }

            blog = settingsService.TransferGlobalSettingsToBlog(blog);

            ICrawler metaCrawler = crawlerFactory.GetCrawler(
                blog,
                new CancellationToken(),
                new PauseToken(),
                new Progress<DownloadProgress>());
            await metaCrawler.UpdateMetaInformationAsync();
        }
Пример #21
0
 /// <summary>
 /// Runs every configured crawler against a shared document loader and repository,
 /// logging the message and stack trace of any exception that escapes.
 /// </summary>
 /// <param name="args">Command-line arguments; not used.</param>
 static void Main(string[] args)
 {
     var logger = new ConsoleLogger();
     try
     {
         var loader = new HtmlDocumentLoader();
         var repository = new CrawlerRepository();

         //var crawlers = new ICrawler[] { new RabotaUaCrawler(logger), new CareersStackoverfowComCrawler(logger) };
         ICrawler[] crawlers = { new CareersStackoverfowComCrawler(logger) };

         for (int index = 0; index < crawlers.Length; index++)
         {
             crawlers[index].Crawl(loader, repository);
         }
     }
     catch (Exception failure)
     {
         logger.Log("FAILED exception caught in Main() method. Exception message: " + failure.Message);
         logger.Log(failure.StackTrace);
     }
 }
Пример #22
0
 /// <summary>
 /// When the "check online status on startup" setting is enabled, runs the online check
 /// for every managed blog with at most 25 checks in flight at the same time.
 /// </summary>
 private async Task CheckBlogsOnlineStatusAsync()
 {
     if (shellService.Settings.CheckOnlineStatusOnStartup)
     {
         await Task.Run(async() =>
         {
             var semaphoreSlim         = new SemaphoreSlim(25);
             IEnumerable <IBlog> blogs = managerService.BlogFiles;
             IEnumerable <Task> tasks  = blogs.Select(async blog =>
             {
                 await semaphoreSlim.WaitAsync();
                 try
                 {
                     ICrawler crawler = CrawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                                                                  new Progress <DownloadProgress>(), shellService, crawlerService, managerService);
                     await crawler.IsBlogOnlineAsync();
                 }
                 finally
                 {
                     // Release even when the check throws; otherwise each failure permanently
                     // consumes a slot and the remaining checks can wait forever.
                     semaphoreSlim.Release();
                 }
             });
             await Task.WhenAll(tasks);
         });
     }
 }
        /// <summary>
        /// Finds every match of the e-mail regular expression in the property bag's text and
        /// stores them, joined with ";", under the "Email" property.
        /// </summary>
        /// <param name="crawler">
        /// The crawler; must not be <c>null</c>.
        /// </param>
        /// <param name="propertyBag">
        /// The property bag; must not be <c>null</c>.
        /// </param>
        /// <returns>An already completed task.</returns>
        public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            var content = propertyBag.Text;

            if (!string.IsNullOrEmpty(content))
            {
                var found     = this.emailRegex.Value.Matches(content);
                var addresses = found.Cast <Match>().Select(hit => hit.Value);

                propertyBag["Email"].Value = addresses.Join(";");
            }

            return Task.CompletedTask;
        }
		/// <summary>
		/// For a successful text response, decodes the body as UTF-8, stores it as the
		/// property bag's title/text, and hands every absolute URL matched in the text to the crawler.
		/// </summary>
		/// <param name="crawler">The crawler that receives the discovered URLs.</param>
		/// <param name="propertyBag">Supplies the response and receives <c>Title</c> and <c>Text</c>.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			// A missing body is skipped instead of letting Encoding.GetString throw
			// ArgumentNullException.
			if (propertyBag.StatusCode != HttpStatusCode.OK
				|| !IsTextContent(propertyBag.ContentType)
				|| propertyBag.Response == null)
			{
				return Task.FromResult(true);
			}

			string content = Encoding.UTF8.GetString(propertyBag.Response);
			propertyBag.Title = propertyBag.Step.Uri.ToString();
			propertyBag.Text = content.Trim();

			MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
			foreach (Match urlMatch in urlMatches)
			{
				// Matches that do not parse as absolute URIs are ignored.
				Uri uri;
				if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
				{
					crawler.Crawl(uri, propertyBag);
				}
			}

			return Task.FromResult(true);
		}
Пример #25
0
        /// <summary>
        /// For a successful text response, reads the whole response and stores it,
        /// trimmed, as the property bag's text.
        /// </summary>
        /// <param name="crawler">The crawler running the pipeline; not used here.</param>
        /// <param name="propertyBag">Supplies the response reader and receives <c>Text</c>.</param>
        /// <returns>Always 0.</returns>
        private async Task <int> ProcessCoreAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            bool isTextResponse = propertyBag.StatusCode == HttpStatusCode.OK
                                  && IsTextContent(propertyBag.ContentType);

            if (isTextResponse)
            {
                using (var responseReader = propertyBag.GetResponse())
                {
                    var body = await responseReader.ReadToEndAsync().ConfigureAwait(false);
                    propertyBag.Text = body.Trim();
                }
            }

            return 0;
        }
        /// <summary>
        /// For a successful text response, decodes the body as UTF-8, stores it as the
        /// property bag's title/text, and hands every absolute URL matched in the text to the crawler.
        /// </summary>
        /// <param name="crawler">The crawler that receives the discovered URLs.</param>
        /// <param name="propertyBag">Supplies the response and receives <c>Title</c> and <c>Text</c>.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // A missing body is skipped instead of letting Encoding.GetString throw
            // ArgumentNullException.
            if (propertyBag.StatusCode != HttpStatusCode.OK ||
                !IsTextContent(propertyBag.ContentType) ||
                propertyBag.Response == null)
            {
                return(Task.FromResult(true));
            }

            string content = Encoding.UTF8.GetString(propertyBag.Response);
            propertyBag.Title = propertyBag.Step.Uri.ToString();
            propertyBag.Text  = content.Trim();

            MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
            foreach (Match urlMatch in urlMatches)
            {
                // Matches that do not parse as absolute URIs are ignored.
                Uri uri;
                if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
                {
                    crawler.Crawl(uri, propertyBag);
                }
            }

            return(Task.FromResult(true));
        }
Пример #27
0
        /// <summary>
        /// Runs every configured crawler against a shared document loader and repository,
        /// logging the message and stack trace of any exception that escapes.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var logger = new ConsoleLogger();

            try
            {
                var loader     = new HtmlDocumentLoader();
                var repository = new CrawlerRepository();

                //var crawlers = new ICrawler[] { new RabotaUaCrawler(logger), new CareersStackoverfowComCrawler(logger) };
                ICrawler[] crawlers = { new CareersStackoverfowComCrawler(logger) };

                for (int index = 0; index < crawlers.Length; index++)
                {
                    crawlers[index].Crawl(loader, repository);
                }
            }
            catch (Exception failure)
            {
                logger.Log("FAILED exception caught in Main() method. Exception message: " + failure.Message);
                logger.Log(failure.StackTrace);
            }
        }
		/// <summary>
		/// Runs language identification on the property bag's text and stores the
		/// <c>Iso639_3</c> code of the first result under the language property.
		/// </summary>
		/// <param name="crawler">The crawler; must not be <c>null</c>.</param>
		/// <param name="propertyBag">The property bag; must not be <c>null</c>.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define.
				NotNull(crawler, "crawler").
				NotNull(propertyBag, "propertyBag");

			string text = propertyBag.Text;
			if (!text.IsNullOrEmpty())
			{
				IEnumerable<Tuple<LanguageInfo, double>> candidates = _identifier.Identify(text);

				// Only the first candidate is used; no candidates leaves the property untouched.
				Tuple<LanguageInfo, double> best = candidates.FirstOrDefault();
				if (best != null)
				{
					propertyBag[LanguagePropertyName].Value = best.Item1.Iso639_3;
				}
			}

			return Task.FromResult(true);
		}
Пример #29
0
        /// <summary>
        /// Crawls the blog of a queue item and afterwards removes the item from the active
        /// list and, unless cancellation was requested, from the queue.
        /// </summary>
        /// <param name="queueListItem">The queue item whose blog is crawled.</param>
        /// <param name="ct">Cancellation token handed to the crawler.</param>
        /// <param name="pt">Pause token handed to the crawler.</param>
        private async Task StartSiteSpecificDownloaderAsync(QueueListItem queueListItem, CancellationToken ct, PauseToken pt)
        {
            IBlog blog = queueListItem.Blog;

            blog.Dirty = true;
            ProgressThrottler <DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);

            ICrawler crawler = crawlerFactory.GetCrawler(blog, ct, pt, progress);
            await crawler.CrawlAsync();

            // lock releases the monitor even if the dispatcher call throws; the previous
            // Monitor.Enter/Exit pair would have left the lock held in that case.
            lock (lockObject)
            {
                QueueOnDispatcher.CheckBeginInvokeOnUI(() => crawlerService.RemoveActiveItem(queueListItem));
            }

            if (!ct.IsCancellationRequested)
            {
                lock (lockObject)
                {
                    QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
                }
            }
        }
Пример #30
0
        /// <summary>
        /// For a successful PDF response, reads the document's "Title" info entry into the
        /// property bag's title and the text of all pages, joined by new lines, into its text.
        /// </summary>
        /// <param name="crawler">The crawler; must not be <c>null</c>.</param>
        /// <param name="propertyBag">The property bag; must not be <c>null</c>.</param>
        /// <returns>An already completed task.</returns>
        public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define.
            NotNull(crawler, "crawler").
            NotNull(propertyBag, "propertyBag");

            if (propertyBag.StatusCode != HttpStatusCode.OK)
            {
                return(Task.CompletedTask);
            }

            if (!IsPdfContent(propertyBag.ContentType))
            {
                return(Task.CompletedTask);
            }

            using (var input = propertyBag.GetResponse())
            {
                var pdfReader = new PdfReader(input);
                try
                {
                    if (pdfReader.Info.TryGetValue("Title", out var title))
                    {
                        propertyBag.Title = Convert.ToString(title, CultureInfo.InvariantCulture).Trim();
                    }

                    // A SimpleTextExtractionStrategy accumulates everything it has rendered, so a
                    // shared instance makes each page's result repeat all earlier pages. A fresh
                    // strategy per page keeps every page's text separate.
                    propertyBag.Text = Enumerable.Range(1, pdfReader.NumberOfPages).
                                       Select(pageNumber => PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, new SimpleTextExtractionStrategy())).
                                       Join(Environment.NewLine);
                }
                finally
                {
                    pdfReader.Close();
                }
            }

            return(Task.CompletedTask);
        }
		/// <summary>
		/// For a successful response with a body, loads the bytes as a PDF document and
		/// stores the text returned by a <c>PDFTextStripper</c> as the property bag's text.
		/// </summary>
		/// <param name="crawler">The crawler running the pipeline; not used by this step.</param>
		/// <param name="propertyBag">Supplies the response and receives <c>Text</c>.</param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			bool hasDownloadedBody = propertyBag.StatusCode == HttpStatusCode.OK
				&& propertyBag.Response != null;
			if (!hasDownloadedBody)
			{
				return Task.FromResult(true);
			}

			PDDocument document = null;
			try
			{
				ByteArrayInputStream input = new ByteArrayInputStream(propertyBag.Response);
				document = PDDocument.load(input);
				propertyBag.Text = new PDFTextStripper().getText(document);
			}
			finally
			{
				// Loading may have failed before a document existed.
				if (document != null)
				{
					document.close();
				}
			}

			return Task.FromResult(true);
		}
Пример #32
0
        /// <summary>
        /// Clears the managed blog list and reloads it from the "Index" folder below the
        /// download location, raises <c>BlogManagerFinishedLoading</c>, and optionally
        /// checks the online status of every loaded blog. Any exception is logged and
        /// shown to the user.
        /// </summary>
        private async Task LoadLibrary()
        {
            Logger.Verbose("ManagerController.LoadLibrary:Start");
            managerService.BlogFiles.Clear();
            string indexPath = Path.Combine(shellService.Settings.DownloadLocation, "Index");

            try
            {
                if (Directory.Exists(indexPath))
                {
                    IReadOnlyList<IBlog> loadedBlogs = await GetIBlogsAsync(indexPath);

                    foreach (IBlog loadedBlog in loadedBlogs)
                    {
                        managerService.BlogFiles.Add(loadedBlog);
                    }

                    var finishedHandler = BlogManagerFinishedLoading;
                    if (finishedHandler != null)
                    {
                        finishedHandler(this, EventArgs.Empty);
                    }

                    if (shellService.Settings.CheckOnlineStatusAtStartup)
                    {
                        // Blogs are checked sequentially, one crawler per blog.
                        foreach (IBlog loadedBlog in loadedBlogs)
                        {
                            ICrawler statusCrawler = CrawlerFactory.GetCrawler(
                                loadedBlog,
                                new CancellationToken(),
                                new PauseToken(),
                                new Progress<DownloadProgress>(),
                                shellService,
                                crawlerService,
                                managerService);

                            await statusCrawler.IsBlogOnlineAsync();
                        }
                    }
                }
            }
            catch (Exception failure)
            {
                Logger.Verbose("ManagerController:LoadLibrary: {0}", failure);
                shellService.ShowError(failure, Resources.CouldNotLoadLibrary, failure.Data["Filename"]);
            }

            Logger.Verbose("ManagerController.LoadLibrary:End");
        }
Пример #33
0
        /// <summary>
        /// Waits for a slot on the semaphore, runs the online check for one blog, then
        /// disposes the crawler and releases the slot.
        /// </summary>
        /// <param name="semaphoreSlim">The semaphore limiting concurrent checks.</param>
        /// <param name="blog">The blog to check.</param>
        private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
        {
            await semaphoreSlim.WaitAsync();

            ICrawler statusCrawler = null;

            try
            {
                var progress = new Progress<DownloadProgress>();
                var pauseToken = new PauseToken();
                var cancellationToken = new CancellationToken();

                statusCrawler = _crawlerFactory.GetCrawler(blog, progress, pauseToken, cancellationToken);
                await statusCrawler.IsBlogOnlineAsync();
            }
            finally
            {
                if (statusCrawler != null)
                {
                    statusCrawler.Dispose();
                }

                try
                {
                    semaphoreSlim.Release();
                }
                catch (ObjectDisposedException)
                {
                    // Deliberately ignored: the semaphore has already been disposed.
                }
            }
        }
        /// <summary>
        /// Loads the response bytes with <c>PDDocument</c> and stores the text returned by
        /// <c>PDFTextStripper</c> in <c>propertyBag.Text</c>.
        /// </summary>
        /// <param name="crawler">Not used by this step.</param>
        /// <param name="propertyBag">Supplies the status code and response bytes; receives the text.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            // Only a successful download that actually carries a body is parsed.
            bool hasBody = propertyBag.StatusCode == HttpStatusCode.OK &&
                           propertyBag.Response != null;

            if (hasBody)
            {
                PDDocument pdf = null;

                try
                {
                    pdf = PDDocument.load(new ByteArrayInputStream(propertyBag.Response));
                    PDFTextStripper textStripper = new PDFTextStripper();
                    propertyBag.Text = textStripper.getText(pdf);
                }
                finally
                {
                    // Close the document even when loading or text extraction throws.
                    if (pdf != null)
                    {
                        pdf.close();
                    }
                }
            }

            return Task.FromResult(true);
        }
Пример #35
0
        /// <summary>
        /// Creates the forms, the persistence/storage/crawler objects and the fetcher table,
        /// then sets the retry intervals from their constants.
        /// </summary>
        public MainControl()
        {
            // Forms are created (and, where an Init call exists, initialised with this
            // control) strictly one after another; the order is kept as is.
            _mainForm = new MainForm();
            _mainForm.Init(this);

            _testForm = new TestForm();
            _testForm.Init(this);

            // Created without an Init call.
            _fetcherForm = new FetcherForm();

            _tagManagementForm = new TagManagementForm();
            _tagManagementForm.Init(this);

            _viewerForm = new ViewerForm();
            _viewerForm.Init(this);

            // Created without an Init call.
            _filterForm = new FilterForm();

            _persistence = new FakePersistenceSimulator();
            _fileStorage = new HierarchyFileStorage();
            _crawler = new SimpleCrawler();

            // Fetchers are looked up by key; "skk" is the only one registered here.
            _fetcherDict = new Dictionary <string, IFetcher>();
            _fetcherDict.Add("skk", new SkkFetcher());
            _fetcherDict["skk"].Init(this);

            _retryIntervalPage = RETRY_INTERVAL_PAGE;
            _retryIntervalIndex = RETRY_INTERVAL_INDEX;
        }
        /// <summary>
        /// Identifies the language of <c>propertyBag.Text</c> and stores the ISO 639-3 code of the
        /// first identified candidate under <c>LanguagePropertyName</c>.
        /// </summary>
        /// <param name="crawler">Must not be null.</param>
        /// <param name="propertyBag">Must not be null; supplies the text and receives the language code.</param>
        /// <returns>A completed task whose result is always <c>true</c>.</returns>
        public Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            AspectF.Define
                .NotNull(crawler, nameof(crawler))
                .NotNull(propertyBag, nameof(propertyBag));

            string text = propertyBag.Text;

            if (!text.IsNullOrEmpty())
            {
                IEnumerable <Tuple <LanguageInfo, double> > candidates = _identifier.Identify(text);

                // Only the first candidate returned by the identifier is considered.
                Tuple <LanguageInfo, double> best = candidates.FirstOrDefault();

                if (best != null)
                {
                    propertyBag[LanguagePropertyName].Value = best.Item1.Iso639_3;
                }
            }

            return Task.FromResult(true);
        }
        /// <summary>
        /// Downloads the URI of the current crawl step with an HTTP GET and records the outcome
        /// in the property bag.
        /// </summary>
        /// <param name="crawler">Not used by this step.</param>
        /// <param name="propertyBag">
        /// Supplies <c>Step.Uri</c>; receives the response metadata, the body bytes and the download time.
        /// </param>
        /// <returns>Always <c>true</c>.</returns>
        public async Task <bool> Process(ICrawler crawler, PropertyBag propertyBag)
        {
            Stopwatch      sw      = Stopwatch.StartNew();
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(propertyBag.Step.Uri);

            request.Method = "GET";
            try
            {
                using (HttpWebResponse httpWebResponse = (HttpWebResponse)await request.GetResponseAsync())
                    using (Stream downloadStream = httpWebResponse.GetResponseStream())
                        using (MemoryStream ms = new MemoryStream())
                        {
                            if (downloadStream != null)
                            {
                                await downloadStream.CopyToAsync(ms);
                            }

                            // Stop timing once the whole body has been buffered.
                            sw.Stop();
                            HttpWebResponseToPropertyBag(httpWebResponse, propertyBag);
                            propertyBag.Response     = ms.ToArray();
                            propertyBag.DownloadTime = sw.Elapsed;
                        }
            }
            catch (WebException ex)
            {
                // The response attached to a WebException holds on to its connection until it
                // is closed, so dispose it here just like the success path disposes its response.
                // NOTE(review): ex.Response is null when no HTTP response was received at all;
                // confirm HttpWebResponseToPropertyBag tolerates a null argument.
                using (WebResponse errorResponse = ex.Response)
                {
                    HttpWebResponseToPropertyBag(errorResponse as HttpWebResponse, propertyBag);
                }

                // MaxValue marks the download as failed.
                propertyBag.DownloadTime = TimeSpan.MaxValue;
            }
            catch (ProtocolViolationException)
            {
                propertyBag.StatusCode   = HttpStatusCode.Forbidden;
                propertyBag.DownloadTime = TimeSpan.MaxValue;
            }

            return(true);
        }
Пример #38
0
        /// <summary>
        /// Runs the online check for one blog while holding a slot of the given semaphore.
        /// Failures are logged, shown through the shell service, and mark the blog as offline.
        /// </summary>
        /// <param name="semaphoreSlim">Semaphore that is awaited before the check and released afterwards.</param>
        /// <param name="blog">Blog to check; its <c>BlogType</c> and <c>Online</c> members may be updated.</param>
        private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
        {
            await semaphoreSlim.WaitAsync();

            ICrawler onlineChecker = null;

            try
            {
                // The hidden-blog detector is only consulted for tumblr blogs; a positive
                // result switches the blog type before the crawler is created.
                if (blog.BlogType == BlogTypes.tumblr &&
                    await _tumblrBlogDetector.IsHiddenTumblrBlogAsync(blog.Url))
                {
                    blog.BlogType = BlogTypes.tmblrpriv;
                }

                onlineChecker = _crawlerFactory.GetCrawler(blog, new Progress <DownloadProgress>(), new PauseToken(), new CancellationToken());
                await onlineChecker.IsBlogOnlineAsync();
            }
            catch (Exception error)
            {
                Logger.Error("ManagerController.CheckStatusOfBlogsAsync: {0}", error);
                _shellService.ShowError(error, $"Online check for '{blog.Name}' failed: {error.Message}");
                blog.Online = false;
            }
            finally
            {
                if (onlineChecker != null)
                {
                    onlineChecker.Dispose();
                }

                try
                {
                    semaphoreSlim.Release();
                }
                catch (ObjectDisposedException)
                {
                    // The semaphore has already been disposed; nothing left to release.
                }
            }
        }
Пример #39
0
        /// <summary>
        ///     Writes a short report about the crawled step (URL, content type, text length,
        ///     depth, culture and thread id) to the console, followed by an empty line.
        /// </summary>
        /// <param name = "crawler">
        ///     The crawler; only referenced by commented-out code.
        /// </param>
        /// <param name = "propertyBag">
        ///     The property bag the reported values are read from.
        /// </param>
        public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
        {
            CultureInfo culture     = (CultureInfo)propertyBag["LanguageCulture"].Value;
            string      cultureName = culture.IsNull() ? "N/A" : culture.DisplayName;

            // The report lines are written under a lock on this instance.
            lock (this)
            {
                TextWriter console = Console.Out;

                console.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri);
                console.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType);
                console.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}",
                                  propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length);
                console.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth);
                console.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureName);
                console.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId);
                // console.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse);
            }

            // The trailing blank line is written outside the lock.
            await Console.Out.WriteLineAsync();
        }
		/// <summary>
		/// Downloads the URI of the current crawl step with an HTTP GET and records the outcome
		/// in the property bag.
		/// </summary>
		/// <param name="crawler">Not used by this step.</param>
		/// <param name="propertyBag">
		/// Supplies <c>Step.Uri</c>; receives the response metadata, the body bytes and the download time.
		/// </param>
		/// <returns>Always <c>true</c>.</returns>
		public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			Stopwatch sw = Stopwatch.StartNew();
			HttpWebRequest request = (HttpWebRequest) WebRequest.Create(propertyBag.Step.Uri);
			request.Method = "GET";
			try
			{
				using (HttpWebResponse httpWebResponse = (HttpWebResponse) await request.GetResponseAsync())
				using (Stream downloadStream = httpWebResponse.GetResponseStream())
				using (MemoryStream ms = new MemoryStream())
				{
					if (downloadStream != null)
					{
						await downloadStream.CopyToAsync(ms);
					}

					// Stop timing once the whole body has been buffered.
					sw.Stop();
					HttpWebResponseToPropertyBag(httpWebResponse, propertyBag);
					propertyBag.Response = ms.ToArray();
					propertyBag.DownloadTime = sw.Elapsed;
				}
			}
			catch (WebException ex)
			{
				// The response attached to a WebException holds on to its connection until it
				// is closed, so dispose it here just like the success path disposes its response.
				// NOTE(review): ex.Response is null when no HTTP response was received at all;
				// confirm HttpWebResponseToPropertyBag tolerates a null argument.
				using (WebResponse errorResponse = ex.Response)
				{
					HttpWebResponseToPropertyBag(errorResponse as HttpWebResponse, propertyBag);
				}

				// MaxValue marks the download as failed.
				propertyBag.DownloadTime = TimeSpan.MaxValue;
			}
			catch (ProtocolViolationException)
			{
				propertyBag.StatusCode = HttpStatusCode.Forbidden;
				propertyBag.DownloadTime = TimeSpan.MaxValue;
			}

			return true;
		}
		/// <summary>
		/// Parses an HTML response, stores the parsed document, title, meta entries and extracted
		/// text in the property bag, and hands every link found in the document to the crawler.
		/// </summary>
		/// <param name="crawler">Must not be null; receives each normalized link via <c>Crawl</c>.</param>
		/// <param name="propertyBag">
		/// Must not be null; supplies status code, content type, response bytes and response URI,
		/// and receives <c>"HtmlDoc"</c>, <c>Title</c>, <c>"Meta"</c> and <c>Text</c>.
		/// </param>
		/// <returns>A completed task whose result is always <c>true</c>.</returns>
		public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
		{
			AspectF.Define
				.NotNull(crawler, nameof(crawler))
				.NotNull(propertyBag, nameof(propertyBag));

			// Skip anything that is not a successful download with a body. Without the null
			// check, the MemoryStream constructor below throws ArgumentNullException.
			if (propertyBag.StatusCode != HttpStatusCode.OK
				|| propertyBag.Response == null)
			{
				return Task.FromResult(true);
			}

			if (!IsHtmlContent(propertyBag.ContentType))
			{
				return Task.FromResult(true);
			}

			HtmlDocument htmlDoc = new HtmlDocument
			{
				OptionAddDebuggingAttributes = false,
				OptionAutoCloseOnEnd = true,
				OptionFixNestedTags = true,
				OptionReadEncoding = true
			};

			using (MemoryStream ms = new MemoryStream(propertyBag.Response))
			{
				// Detecting the encoding consumes the stream, so rewind before loading.
				Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
				ms.Seek(0, SeekOrigin.Begin);
				if (!documentEncoding.IsNull())
				{
					htmlDoc.Load(ms, documentEncoding, true);
				}
				else
				{
					htmlDoc.Load(ms, true);
				}
			}

			// Keep the unmodified markup; it is the input for both rule passes below.
			string originalContent = htmlDoc.DocumentNode.OuterHtml;
			if (HasTextStripRules || HasSubstitutionRules)
			{
				// Reload the document from the stripped and substituted markup.
				string content = StripText(originalContent);
				content = Substitute(content, propertyBag.Step);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			propertyBag["HtmlDoc"].Value = htmlDoc;

			HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
			// Extract Title
			if (!nodes.IsNull())
			{
				propertyBag.Title = string.Join(";", nodes.
					Select(n => n.InnerText).
					ToArray()).Trim();
			}

			// Extract Meta Data
			nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
			if (!nodes.IsNull())
			{
				propertyBag["Meta"].Value = (
					from entry in nodes
					let name = entry.Attributes["name"]
					let content = entry.Attributes["content"]
					where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
					select $"{name.Value}: {content.Value}").ToArray();
			}

			// Extract text
			propertyBag.Text = htmlDoc.ExtractText().Trim();

			// Reload the document from the original markup with the link strip rules applied,
			// so that link extraction works on the link-stripped version.
			// NOTE(review): the condition also tests HasTextStripRules although only
			// StripLinks is applied here; confirm this is intended.
			if (HasLinkStripRules || HasTextStripRules)
			{
				string content = StripLinks(originalContent);
				using (TextReader tr = new StringReader(content))
				{
					htmlDoc.Load(tr);
				}
			}

			// Default base for relative links: the response URI up to and including its path.
			string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

			// Extract Head Base
			nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
			if (!nodes.IsNull())
			{
				// The first well-formed <base href> wins; the default base is appended as a fallback.
				baseUrl = nodes
					.Select(entry => new {entry, href = entry.Attributes["href"]})
					.Where(arg => !arg.href.IsNull()
						&& !arg.href.Value.IsNullOrEmpty()
						&& Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
					.Select(t =>
					{
						if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
						{
							// A relative base is appended to the scheme and server of the response URI.
							return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
						}

						return t.href.Value;
					})
					.AddToEnd(baseUrl)
					.FirstOrDefault();
			}

			// Extract Links
			DocumentWithLinks links = htmlDoc.GetLinks();
			foreach (string link in links.Links.Union(links.References))
			{
				if (link.IsNullOrEmpty())
				{
					continue;
				}

				string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
				string normalizedLink = NormalizeLink(baseUrl, decodedLink);
				if (normalizedLink.IsNullOrEmpty())
				{
					continue;
				}

				crawler.Crawl(new Uri(normalizedLink), propertyBag);
			}

			return Task.FromResult(true);
		}
Пример #42
0
 /// <summary>
 /// Forwards to <c>ProcessCoreAsync</c> with the same arguments and returns its task.
 /// </summary>
 /// <param name="crawler">Passed through unchanged.</param>
 /// <param name="propertyBag">Passed through unchanged.</param>
 public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
     => this.ProcessCoreAsync(crawler, propertyBag);
Пример #43
0
 /// <summary>
 /// Stores the crawler and the event repository this handler works with.
 /// </summary>
 /// <param name="crawler">Assigned to <c>_crawler</c>.</param>
 /// <param name="eventRepository">Assigned to <c>_eventRepository</c>.</param>
 public RunCrawlerCommandHandler(ICrawler crawler, IEventRepository eventRepository)
 {
     _eventRepository = eventRepository;
     _crawler = crawler;
 }
Пример #44
0
 /// <summary>
 /// Passes <paramref name="service"/> to the base constructor and stores the result of
 /// <c>GetCrawler()</c> in <c>_crawler</c>.
 /// </summary>
 /// <param name="service">User agent service forwarded to the base class.</param>
 public CrawlerResolver(IUserAgentService service) : base(service)
 {
     // Runs after the base constructor has completed.
     _crawler = GetCrawler();
 }