public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK || propertyBag.Response == null)
    {
        return true;
    }

    string extension = MapContentTypeToExtension(propertyBag.ContentType);
    if (extension.IsNullOrEmpty())
    {
        return true;
    }

    propertyBag.Title = propertyBag.Step.Uri.PathAndQuery;
    using (TempFile temp = new TempFile())
    {
        temp.FileName += "." + extension;
        using (FileStream fs = new FileStream(temp.FileName, FileMode.Create, FileAccess.Write, FileShare.Read, 0x1000))
        {
            await fs.WriteAsync(propertyBag.Response, 0, propertyBag.Response.Length);
        }

        ParserContext context = new ParserContext(temp.FileName);
        ITextParser parser = ParserFactory.CreateText(context);
        propertyBag.Text = parser.Parse();
    }

    return true;
}
public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    FlurlClient client = propertyBag.Step.Uri.ToString()
        .ConfigureHttpClient(httpClient => { });
    client.Settings.AfterCall += httpCall =>
    {
        propertyBag[FlurlHttpCallPropertyName].Value = httpCall;
        propertyBag.DownloadTime = httpCall.Duration.GetValueOrDefault();
    };

    HttpResponseMessage getResult = await client.GetAsync();
    propertyBag.CharacterSet = getResult.Content.Headers.ContentType.CharSet;
    propertyBag.ContentEncoding = string.Join(";", getResult.Content.Headers.ContentEncoding);
    propertyBag.ContentType = getResult.Content.Headers.ContentType.MediaType;
    propertyBag.Headers = getResult.Content.Headers.ToDictionary(x => x.Key, x => x.Value);
    propertyBag.LastModified = getResult.Headers.Date.GetValueOrDefault(DateTimeOffset.UtcNow).DateTime;
    propertyBag.Method = "GET";
    // TODO: ProtocolVersion and ResponseUri are not mapped by this step.
    propertyBag.Server = string.Join(";", getResult.Headers.Server.Select(x => x.Product.ToString()));
    propertyBag.StatusCode = getResult.StatusCode;
    propertyBag.StatusDescription = getResult.StatusCode.ToString();
    propertyBag.Response = await getResult.Content.ReadAsByteArrayAsync();
    return true;
}
/// <summary>
/// Initializes a new instance of the <see cref="AnalyticsRunner"/> class.
/// </summary>
/// <param name="crawler">The crawler to run analytics against.</param>
/// <param name="modelAnalyzer">The model analyzer; defaults to <see cref="ArticlesSiteAnalyzer"/> when null.</param>
public AnalyticsRunner(ICrawler crawler, ModelAnalyzer modelAnalyzer = null)
{
    if (crawler == null)
    {
        throw new ArgumentNullException(nameof(crawler));
    }

    this.crawler = crawler;
    this.modelAnalyzer = modelAnalyzer ?? new ArticlesSiteAnalyzer();
}
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (_predicate != null)
    {
        return Task.FromResult(_predicate(crawler, propertyBag));
    }

    if (_predicate2 != null)
    {
        return _predicate2(crawler, propertyBag);
    }

    return Task.FromResult(true);
}
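// A minimal usage sketch for the predicate step above. The class name
// (PredicatePipelineStep) and its constructors are assumptions inferred from
// the _predicate/_predicate2 fields; the real type in the project may differ.
public class PredicatePipelineStep
{
    private readonly Func<ICrawler, PropertyBag, bool> _predicate;
    private readonly Func<ICrawler, PropertyBag, Task<bool>> _predicate2;

    public PredicatePipelineStep(Func<ICrawler, PropertyBag, bool> predicate)
    {
        _predicate = predicate;
    }

    public PredicatePipelineStep(Func<ICrawler, PropertyBag, Task<bool>> predicate)
    {
        _predicate2 = predicate;
    }
}

// Example: stop the pipeline for anything that is not a 200 response.
// var onlyOk = new PredicatePipelineStep((c, bag) => bag.StatusCode == HttpStatusCode.OK);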
private async Task UpdateMetaInformationAsync(IBlog blog)
{
    ICrawler crawler = null;
    try
    {
        crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), new PauseToken(), new CancellationToken());
        await crawler.UpdateMetaInformationAsync();
    }
    finally
    {
        crawler?.Dispose();
    }
}
private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
{
    await semaphoreSlim.WaitAsync();
    try
    {
        ICrawler crawler = crawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(), new Progress<DownloadProgress>());
        await crawler.IsBlogOnlineAsync();
    }
    finally
    {
        semaphoreSlim.Release();
    }
}
public static ICrawler CreateMine(string url)
{
    ICrawler result = null;
    string name = "{0}"._Format(DefaultConfiguration.GetAppSetting("ApplicationName", "UNKNOWN"));
    if (Instances.ContainsKey(name))
    {
        result = Instances[name];
    }
    else
    {
        result = Create(name, url);
    }

    return result;
}
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK || propertyBag.Response == null || propertyBag.Response.Length == 0)
    {
        return Task.FromResult(true);
    }

    if (!IsXmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        XDocument mydoc = XDocument.Load(ms);
        if (mydoc.Root == null)
        {
            return Task.FromResult(true);
        }

        XName qualifiedName = XName.Get("loc", "http://www.sitemaps.org/schemas/sitemap/0.9");
        // Note: only http:// locations pass this filter; https:// sitemap
        // entries are silently skipped.
        IEnumerable<string> urlNodes =
            from e in mydoc.Descendants(qualifiedName)
            where !e.Value.IsNullOrEmpty() && e.Value.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            select e.Value;

        foreach (string url in urlNodes)
        {
            // Add new crawler steps
            string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);
            string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(url);
            string normalizedLink = NormalizeLink(baseUrl, decodedLink);
            if (normalizedLink.IsNullOrEmpty())
            {
                continue;
            }

            propertyBag["PropertyBagKeyOriginalUrl"].Value = url;
            propertyBag["PropertyBagKeyOriginalReferrerUrl"].Value = propertyBag.ResponseUri;
            crawler.Crawl(new Uri(normalizedLink), propertyBag);
        }
    }

    return Task.FromResult(true);
}
public virtual async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    // Get text from previous pipeline step
    var text = propertyBag.Text;
    if (this.HasTextStripRules)
    {
        text = this.StripText(text);
    }

    if (text.IsNullOrEmpty())
    {
        return;
    }

    if (this.HasLinkStripRules)
    {
        text = this.StripLinks(text);
    }

    // Find links
    var matches = s_LinkRegex.Value.Matches(text);
    foreach (var match in matches.Cast<Match>().Where(m => m.Success))
    {
        var link = match.Value;
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        var baseUrl = propertyBag.ResponseUri.GetLeftPath();
        var normalizedLink = link.NormalizeUrl(baseUrl);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        // Add new step to crawler
        await crawler.AddStepAsync(new Uri(normalizedLink), propertyBag.Step.Depth + 1,
            propertyBag.Step,
            new Dictionary<string, object>
            {
                { Resources.PropertyBagKeyOriginalUrl, new Uri(link) },
                { Resources.PropertyBagKeyOriginalReferrerUrl, propertyBag.ResponseUri }
            }).ConfigureAwait(false);
    }
}
public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    string robotsHttpUrl = string.IsNullOrEmpty(_searchPath)
        ? $"{propertyBag.Step.Uri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped).ToLowerInvariant()}/robots.txt"
        : $"{propertyBag.Step.Uri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped).ToLowerInvariant()}" + _searchPath;

    RobotsTxt.Robots robots;
    if (!_robotsInfo.TryGetValue(robotsHttpUrl, out robots))
    {
        _logger.Verbose("Downloading robots.txt file from {@0}", robotsHttpUrl);
        string robotsContext = null;
        try
        {
            robotsContext = await _httpClient.GetStringAsync(robotsHttpUrl);
        }
        // A failed download is treated as an empty robots.txt below.
        catch (WebException) { }
        catch (ProtocolViolationException) { }
        catch (HttpRequestException) { }

        robots = new RobotsTxt.Robots(robotsContext ?? string.Empty);
        _robotsInfo.Add(robotsHttpUrl, robots);
    }

    if (!robots.HasRules)
    {
        return true;
    }

    long crawlDelay = robots.CrawlDelay(propertyBag.UserAgent);
    if (crawlDelay > 0)
    {
        await Task.Delay((int)crawlDelay);
    }

    bool result = robots.IsPathAllowed(propertyBag.UserAgent, propertyBag.Step.Uri.ToString());
    propertyBag[RobotsIsPathAllowedPropertyName].Name = nameof(RobotsPipelineStep);
    propertyBag[RobotsIsPathAllowedPropertyName].Value = result;
    return result;
}
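// Supporting state for the robots.txt step above, sketched from the names the
// snippet uses (_robotsInfo, _httpClient); the real declarations are not shown
// and the types here are inferred, not confirmed.
private readonly Dictionary<string, RobotsTxt.Robots> _robotsInfo =
    new Dictionary<string, RobotsTxt.Robots>();
private static readonly HttpClient _httpClient = new HttpClient();

// Design note: a plain Dictionary is not thread-safe, so if pipeline steps run
// concurrently, a ConcurrentDictionary would be the safer cache here.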
private async Task CheckBlogsOnlineStatus()
{
    if (shellService.Settings.CheckOnlineStatusAtStartup)
    {
        await Task.Run(async () =>
        {
            IEnumerable<IBlog> blogs = managerService.BlogFiles;
            foreach (IBlog blog in blogs)
            {
                ICrawler crawler = CrawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                    new Progress<DownloadProgress>(), shellService, crawlerService, managerService);
                await crawler.IsBlogOnlineAsync();
            }
        });
    }
}
private async Task CheckStatusAsync()
{
    await Task.Run(async () =>
    {
        var semaphoreSlim = new SemaphoreSlim(25);
        IEnumerable<IBlog> blogs = selectionService.SelectedBlogFiles.ToArray();
        IEnumerable<Task> tasks = blogs.Select(async blog =>
        {
            await semaphoreSlim.WaitAsync();
            try
            {
                ICrawler crawler = crawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                    new Progress<DownloadProgress>());
                await crawler.IsBlogOnlineAsync();
            }
            finally
            {
                // Release in a finally block so a failed online check cannot
                // leak the semaphore slot.
                semaphoreSlim.Release();
            }
        });
        await Task.WhenAll(tasks);
    });
}
/// <summary>
/// Scans the page text for e-mail addresses and stores them in the property bag.
/// </summary>
/// <param name="crawler">The crawler.</param>
/// <param name="propertyBag">The property bag.</param>
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(propertyBag, "propertyBag");

    string text = propertyBag.Text;
    if (!text.IsNullOrEmpty())
    {
        MatchCollection matches = s_emailRegex.Value.Matches(text);
        propertyBag["Email"].Value = matches
            .Cast<Match>()
            .Select(match => match.Value)
            .ToArray();
    }

    return Task.FromResult(true);
}
private async Task AddBlogAsync(string blogUrl)
{
    if (string.IsNullOrEmpty(blogUrl))
    {
        blogUrl = crawlerService.NewBlogUrl;
    }

    // TODO: Dependency, not SOLID!
    IBlog blog;
    try
    {
        blog = BlogFactory.GetBlog(blogUrl, Path.Combine(shellService.Settings.DownloadLocation, "Index"));
    }
    catch (ArgumentException)
    {
        return;
    }

    blog = settingsService.TransferGlobalSettingsToBlog(blog);
    ICrawler crawler = CrawlerFactory.GetCrawler(blog.BlogType, new CancellationToken(), new PauseToken(),
        new Progress<DownloadProgress>(), shellService, crawlerService, blog);
    await crawler.IsBlogOnlineAsync();

    if (CheckIfTumblrPrivateBlog(blog))
    {
        blog = PromoteTumblrBlogToPrivateBlog(blog);
        crawler = CrawlerFactory.GetCrawler(blog.BlogType, new CancellationToken(), new PauseToken(),
            new Progress<DownloadProgress>(), shellService, crawlerService, blog);
    }

    await crawler.UpdateMetaInformationAsync();

    lock (lockObject)
    {
        if (managerService.BlogFiles.Any(blogs => blogs.Name.Equals(blog.Name) && blogs.BlogType.Equals(blog.BlogType)))
        {
            shellService.ShowError(null, Resources.BlogAlreadyExist, blog.Name);
            return;
        }

        if (blog.Save())
        {
            QueueOnDispatcher.CheckBeginInvokeOnUI((Action)(() => managerService.BlogFiles.Add(blog)));
        }
    }
}
static void Main(string[] args)
{
    var svc = new BikeShareWriteService();
    var crawlers = new ICrawler[]
    {
        new MontrealBixiCrawler(svc),
        new TorontoBixiCrawler(svc),
        new CapitalBikeShareCrawler(svc),
        new HubwayCrawler(svc),
        new NiceRideMNCrawler(svc),
        new VelovCrawler(svc),
        new BarclaysCycleHireCrawler(svc),
        new BicingCrawler(svc),
    };

    Parallel.ForEach(crawlers, x => x.Run());
    System.Console.ReadLine();
}
private async Task StartSiteSpecificDownloaderAsync(QueueListItem queueListItem, PauseToken pt, CancellationToken ct)
{
    IBlog blog = queueListItem.Blog;
    blog.Dirty = true;
    ProgressThrottler<DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);

    ICrawler crawler = null;
    try
    {
        crawler = _crawlerFactory.GetCrawler(blog, progress, pt, ct);
        queueListItem.InterruptionRequested += crawler.InterruptionRequestedEventHandler;
        await crawler.CrawlAsync();
        blog.UpdateProgress(false);
    }
    catch (Exception e)
    {
        if (!ct.IsCancellationRequested)
        {
            Logger.Error("CrawlerController.StartSiteSpecificDownloaderAsync: {0}", e);
        }
    }
    finally
    {
        if (crawler != null)
        {
            queueListItem.InterruptionRequested -= crawler.InterruptionRequestedEventHandler;
        }

        crawler?.Dispose();
    }

    Monitor.Enter(_lockObject);
    QueueOnDispatcher.CheckBeginInvokeOnUI(() => _crawlerService.RemoveActiveItem(queueListItem));
    Monitor.Exit(_lockObject);

    if (!ct.IsCancellationRequested)
    {
        Monitor.Enter(_lockObject);
        QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
        Monitor.Exit(_lockObject);
    }
}
private async Task AddBlogAsync(string blogUrl)
{
    if (string.IsNullOrEmpty(blogUrl))
    {
        blogUrl = crawlerService.NewBlogUrl;
    }

    IBlog blog;
    try
    {
        blog = blogFactory.GetBlog(blogUrl, Path.Combine(shellService.Settings.DownloadLocation, "Index"));
    }
    catch (ArgumentException)
    {
        return;
    }

    if (blog.GetType() == typeof(TumblrBlog) && await tumblrBlogDetector.IsHiddenTumblrBlog(blog.Url))
    {
        blog = PromoteTumblrBlogToHiddenBlog(blog);
    }

    lock (lockObject)
    {
        if (managerService.BlogFiles.Any(blogs => blogs.Name.Equals(blog.Name) && blogs.BlogType.Equals(blog.BlogType)))
        {
            shellService.ShowError(null, Resources.BlogAlreadyExist, blog.Name);
            return;
        }

        if (blog.Save())
        {
            AddToManager(blog);
        }
    }

    blog = settingsService.TransferGlobalSettingsToBlog(blog);
    ICrawler crawler = crawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(), new Progress<DownloadProgress>());
    await crawler.UpdateMetaInformationAsync();
}
static void Main(string[] args)
{
    var logger = new ConsoleLogger();
    try
    {
        var loader = new HtmlDocumentLoader();
        var repository = new CrawlerRepository();
        //var crawlers = new ICrawler[] { new RabotaUaCrawler(logger), new CareersStackoverfowComCrawler(logger) };
        var crawlers = new ICrawler[] { new CareersStackoverfowComCrawler(logger) };
        foreach (var crawler in crawlers)
        {
            crawler.Crawl(loader, repository);
        }
    }
    catch (Exception e)
    {
        logger.Log("FAILED exception caught in Main() method. Exception message: " + e.Message);
        logger.Log(e.StackTrace);
    }
}
private async Task CheckBlogsOnlineStatusAsync()
{
    if (shellService.Settings.CheckOnlineStatusOnStartup)
    {
        await Task.Run(async () =>
        {
            var semaphoreSlim = new SemaphoreSlim(25);
            IEnumerable<IBlog> blogs = managerService.BlogFiles;
            IEnumerable<Task> tasks = blogs.Select(async blog =>
            {
                await semaphoreSlim.WaitAsync();
                try
                {
                    ICrawler crawler = CrawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                        new Progress<DownloadProgress>(), shellService, crawlerService, managerService);
                    await crawler.IsBlogOnlineAsync();
                }
                finally
                {
                    // Ensure the slot is returned even if the online check throws.
                    semaphoreSlim.Release();
                }
            });
            await Task.WhenAll(tasks);
        });
    }
}
/// <summary> /// </summary> /// <param name="crawler"> /// The crawler. /// </param> /// <param name="propertyBag"> /// The property bag. /// </param> public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag) { AspectF.Define. NotNull(crawler, "crawler"). NotNull(propertyBag, "propertyBag"); var text = propertyBag.Text; if (string.IsNullOrEmpty(text)) { return(Task.CompletedTask); } var matches = this.emailRegex.Value.Matches(text); propertyBag["Email"].Value = matches.Cast <Match>(). Select(match => match.Value). Join(";"); return(Task.CompletedTask); }
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode == HttpStatusCode.OK && IsTextContent(propertyBag.ContentType))
    {
        string content = Encoding.UTF8.GetString(propertyBag.Response);
        propertyBag.Title = propertyBag.Step.Uri.ToString();
        propertyBag.Text = content.Trim();

        MatchCollection urlMatches = _urlMatcher.Matches(propertyBag.Text);
        foreach (Match urlMatch in urlMatches)
        {
            Uri uri;
            if (Uri.TryCreate(urlMatch.Value, UriKind.Absolute, out uri))
            {
                crawler.Crawl(uri, propertyBag);
            }
        }
    }

    return Task.FromResult(true);
}
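// The _urlMatcher field is referenced but not defined in the snippet above.
// A minimal stand-in pattern (an assumption, not the project's actual regex)
// that matches absolute http/https URLs in plain text:
private static readonly Regex _urlMatcher =
    new Regex(@"https?://[^\s""'<>]+", RegexOptions.Compiled | RegexOptions.IgnoreCase);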
private async Task<int> ProcessCoreAsync(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return 0;
    }

    if (!IsTextContent(propertyBag.ContentType))
    {
        return 0;
    }

    using (var reader = propertyBag.GetResponse())
    {
        var content = await reader.ReadToEndAsync().ConfigureAwait(false);
        propertyBag.Text = content.Trim();
    }

    return 0;
}
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    string content = propertyBag.Text;
    if (content.IsNullOrEmpty())
    {
        return Task.FromResult(true);
    }

    IEnumerable<Tuple<LanguageInfo, double>> languages = _identifier.Identify(content);
    Tuple<LanguageInfo, double> mostCertainLanguage = languages.FirstOrDefault();
    if (mostCertainLanguage != null)
    {
        propertyBag[LanguagePropertyName].Value = mostCertainLanguage.Item1.Iso639_3;
    }

    return Task.FromResult(true);
}
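// Sketch of how the _identifier used above is typically constructed. The
// Identify(...) signature and LanguageInfo.Iso639_3 suggest NTextCat's
// RankedLanguageIdentifier, but that dependency is an inference and the
// profile file name below is an assumption.
private readonly RankedLanguageIdentifier _identifier =
    new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");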
private async Task StartSiteSpecificDownloaderAsync(QueueListItem queueListItem, CancellationToken ct, PauseToken pt)
{
    IBlog blog = queueListItem.Blog;
    blog.Dirty = true;
    ProgressThrottler<DownloadProgress> progress = SetupThrottledQueueListProgress(queueListItem);

    ICrawler crawler = crawlerFactory.GetCrawler(blog, ct, pt, progress);
    await crawler.CrawlAsync();

    Monitor.Enter(lockObject);
    QueueOnDispatcher.CheckBeginInvokeOnUI(() => crawlerService.RemoveActiveItem(queueListItem));
    Monitor.Exit(lockObject);

    if (!ct.IsCancellationRequested)
    {
        Monitor.Enter(lockObject);
        QueueOnDispatcher.CheckBeginInvokeOnUI(() => QueueManager.RemoveItem(queueListItem));
        Monitor.Exit(lockObject);
    }
}
public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define.
        NotNull(crawler, "crawler").
        NotNull(propertyBag, "propertyBag");

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return Task.CompletedTask;
    }

    if (!IsPdfContent(propertyBag.ContentType))
    {
        return Task.CompletedTask;
    }

    using (var input = propertyBag.GetResponse())
    {
        var pdfReader = new PdfReader(input);
        try
        {
            if (pdfReader.Info.TryGetValue("Title", out var title))
            {
                propertyBag.Title = Convert.ToString(title, CultureInfo.InvariantCulture).Trim();
            }

            var textExtractionStrategy = new SimpleTextExtractionStrategy();
            propertyBag.Text = Enumerable.Range(1, pdfReader.NumberOfPages).
                Select(pageNumber => PdfTextExtractor.GetTextFromPage(pdfReader, pageNumber, textExtractionStrategy)).
                Join(Environment.NewLine);
        }
        finally
        {
            pdfReader.Close();
        }
    }

    return Task.CompletedTask;
}
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    if (propertyBag.StatusCode != HttpStatusCode.OK || propertyBag.Response == null)
    {
        return Task.FromResult(true);
    }

    PDDocument doc = null;
    try
    {
        doc = PDDocument.load(new ByteArrayInputStream(propertyBag.Response));
        PDFTextStripper stripper = new PDFTextStripper();
        propertyBag.Text = stripper.getText(doc);
    }
    finally
    {
        doc?.close();
    }

    return Task.FromResult(true);
}
private async Task LoadLibrary()
{
    Logger.Verbose("ManagerController.LoadLibrary:Start");
    managerService.BlogFiles.Clear();
    string path = Path.Combine(shellService.Settings.DownloadLocation, "Index");

    try
    {
        if (Directory.Exists(path))
        {
            IReadOnlyList<IBlog> files = await GetIBlogsAsync(path);
            foreach (IBlog file in files)
            {
                managerService.BlogFiles.Add(file);
            }

            BlogManagerFinishedLoading?.Invoke(this, EventArgs.Empty);

            if (shellService.Settings.CheckOnlineStatusAtStartup)
            {
                foreach (IBlog blog in files)
                {
                    ICrawler downloader = CrawlerFactory.GetCrawler(blog, new CancellationToken(), new PauseToken(),
                        new Progress<DownloadProgress>(), shellService, crawlerService, managerService);
                    await downloader.IsBlogOnlineAsync();
                }
            }
        }
    }
    catch (Exception ex)
    {
        Logger.Verbose("ManagerController:LoadLibrary: {0}", ex);
        shellService.ShowError(ex, Resources.CouldNotLoadLibrary, ex.Data["Filename"]);
    }

    Logger.Verbose("ManagerController.LoadLibrary:End");
}
private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
{
    await semaphoreSlim.WaitAsync();
    ICrawler crawler = null;
    try
    {
        crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), new PauseToken(), new CancellationToken());
        await crawler.IsBlogOnlineAsync();
    }
    finally
    {
        crawler?.Dispose();
        try
        {
            semaphoreSlim.Release();
        }
        catch (ObjectDisposedException)
        {
        }
    }
}
public MainControl()
{
    _mainForm = new MainForm();
    _mainForm.Init(this);
    _testForm = new TestForm();
    _testForm.Init(this);
    _fetcherForm = new FetcherForm();
    _tagManagementForm = new TagManagementForm();
    _tagManagementForm.Init(this);
    _viewerForm = new ViewerForm();
    _viewerForm.Init(this);
    _filterForm = new FilterForm();

    _persistence = new FakePersistenceSimulator();
    _fileStorage = new HierarchyFileStorage();
    _crawler = new SimpleCrawler();

    _fetcherDict = new Dictionary<string, IFetcher>();
    _fetcherDict["skk"] = new SkkFetcher();
    _fetcherDict["skk"].Init(this);

    _retryIntervalPage = RETRY_INTERVAL_PAGE;
    _retryIntervalIndex = RETRY_INTERVAL_INDEX;
}
public async Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    Stopwatch sw = Stopwatch.StartNew();
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(propertyBag.Step.Uri);
    request.Method = "GET";

    try
    {
        using (HttpWebResponse httpWebResponse = (HttpWebResponse)await request.GetResponseAsync())
        using (Stream downloadStream = httpWebResponse.GetResponseStream())
        using (MemoryStream ms = new MemoryStream())
        {
            if (downloadStream != null)
            {
                await downloadStream.CopyToAsync(ms);
            }

            sw.Stop();
            HttpWebResponseToPropertyBag(httpWebResponse, propertyBag);
            propertyBag.Response = ms.ToArray();
            propertyBag.DownloadTime = sw.Elapsed;
        }
    }
    catch (WebException ex)
    {
        HttpWebResponse httpWebResponse = ex.Response as HttpWebResponse;
        HttpWebResponseToPropertyBag(httpWebResponse, propertyBag);
        propertyBag.DownloadTime = TimeSpan.MaxValue;
    }
    catch (ProtocolViolationException)
    {
        propertyBag.StatusCode = HttpStatusCode.Forbidden;
        propertyBag.DownloadTime = TimeSpan.MaxValue;
    }

    return true;
}
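// Sketch of the HttpWebResponseToPropertyBag helper referenced above. The
// mapping is inferred from the PropertyBag properties the Flurl-based
// downloader earlier in this listing populates; the real helper may copy
// more or fewer fields.
private static void HttpWebResponseToPropertyBag(HttpWebResponse response, PropertyBag bag)
{
    if (response == null)
    {
        return;
    }

    bag.StatusCode = response.StatusCode;
    bag.StatusDescription = response.StatusDescription;
    bag.ContentType = response.ContentType;
    bag.CharacterSet = response.CharacterSet;
    bag.ContentEncoding = response.ContentEncoding;
    bag.LastModified = response.LastModified;
    bag.Method = response.Method;
    bag.Server = response.Server;
}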
private async Task CheckStatusOfBlogsAsync(SemaphoreSlim semaphoreSlim, IBlog blog)
{
    await semaphoreSlim.WaitAsync();
    ICrawler crawler = null;
    try
    {
        bool isHiddenTumblrBlog = false;
        if (blog.BlogType == BlogTypes.tumblr)
        {
            isHiddenTumblrBlog = await _tumblrBlogDetector.IsHiddenTumblrBlogAsync(blog.Url);
        }

        if (isHiddenTumblrBlog)
        {
            blog.BlogType = BlogTypes.tmblrpriv;
        }

        crawler = _crawlerFactory.GetCrawler(blog, new Progress<DownloadProgress>(), new PauseToken(), new CancellationToken());
        await crawler.IsBlogOnlineAsync();
    }
    catch (Exception ex)
    {
        Logger.Error("ManagerController.CheckStatusOfBlogsAsync: {0}", ex);
        _shellService.ShowError(ex, $"Online check for '{blog.Name}' failed: {ex.Message}");
        blog.Online = false;
    }
    finally
    {
        crawler?.Dispose();
        try
        {
            semaphoreSlim.Release();
        }
        catch (ObjectDisposedException)
        {
        }
    }
}
/// <summary> /// </summary> /// <param name = "crawler"> /// The crawler. /// </param> /// <param name = "propertyBag"> /// The property bag. /// </param> public async Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag) { var contentCulture = (CultureInfo)propertyBag["LanguageCulture"].Value; var cultureDisplayValue = "N/A"; if (!contentCulture.IsNull()) { cultureDisplayValue = contentCulture.DisplayName; } lock (this) { Console.Out.WriteLine(ConsoleColor.Gray, "Url: {0}", propertyBag.Step.Uri); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent type: {0}", propertyBag.ContentType); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tContent length: {0}", propertyBag.Text.IsNull() ? 0 : propertyBag.Text.Length); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tDepth: {0}", propertyBag.Step.Depth); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tCulture: {0}", cultureDisplayValue); Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThreadId: {0}", Thread.CurrentThread.ManagedThreadId); // Console.Out.WriteLine(ConsoleColor.DarkGreen, "\tThread Count: {0}", crawler.ThreadsInUse); } await Console.Out.WriteLineAsync(); }
public Task<bool> Process(ICrawler crawler, PropertyBag propertyBag)
{
    AspectF.Define
        .NotNull(crawler, nameof(crawler))
        .NotNull(propertyBag, nameof(propertyBag));

    if (propertyBag.StatusCode != HttpStatusCode.OK)
    {
        return Task.FromResult(true);
    }

    if (!IsHtmlContent(propertyBag.ContentType))
    {
        return Task.FromResult(true);
    }

    HtmlDocument htmlDoc = new HtmlDocument
    {
        OptionAddDebuggingAttributes = false,
        OptionAutoCloseOnEnd = true,
        OptionFixNestedTags = true,
        OptionReadEncoding = true
    };

    using (MemoryStream ms = new MemoryStream(propertyBag.Response))
    {
        Encoding documentEncoding = htmlDoc.DetectEncoding(ms);
        ms.Seek(0, SeekOrigin.Begin);
        if (!documentEncoding.IsNull())
        {
            htmlDoc.Load(ms, documentEncoding, true);
        }
        else
        {
            htmlDoc.Load(ms, true);
        }
    }

    string originalContent = htmlDoc.DocumentNode.OuterHtml;
    if (HasTextStripRules || HasSubstitutionRules)
    {
        string content = StripText(originalContent);
        content = Substitute(content, propertyBag.Step);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    propertyBag["HtmlDoc"].Value = htmlDoc;

    // Extract title
    HtmlNodeCollection nodes = htmlDoc.DocumentNode.SelectNodes("//title");
    if (!nodes.IsNull())
    {
        propertyBag.Title = string.Join(";", nodes
            .Select(n => n.InnerText)
            .ToArray()).Trim();
    }

    // Extract meta data
    nodes = htmlDoc.DocumentNode.SelectNodes("//meta[@content and @name]");
    if (!nodes.IsNull())
    {
        propertyBag["Meta"].Value = (
            from entry in nodes
            let name = entry.Attributes["name"]
            let content = entry.Attributes["content"]
            where !name.IsNull() && !name.Value.IsNullOrEmpty() && !content.IsNull() && !content.Value.IsNullOrEmpty()
            select $"{name.Value}: {content.Value}").ToArray();
    }

    // Extract text
    propertyBag.Text = htmlDoc.ExtractText().Trim();

    if (HasLinkStripRules || HasTextStripRules)
    {
        string content = StripLinks(originalContent);
        using (TextReader tr = new StringReader(content))
        {
            htmlDoc.Load(tr);
        }
    }

    string baseUrl = propertyBag.ResponseUri.GetLeftPart(UriPartial.Path);

    // Extract head base
    nodes = htmlDoc.DocumentNode.SelectNodes("//head/base[@href]");
    if (!nodes.IsNull())
    {
        baseUrl = nodes
            .Select(entry => new { entry, href = entry.Attributes["href"] })
            .Where(arg => !arg.href.IsNull()
                && !arg.href.Value.IsNullOrEmpty()
                && Uri.IsWellFormedUriString(arg.href.Value, UriKind.RelativeOrAbsolute))
            .Select(t =>
            {
                if (Uri.IsWellFormedUriString(t.href.Value, UriKind.Relative))
                {
                    return propertyBag.ResponseUri.GetComponents(UriComponents.SchemeAndServer, UriFormat.Unescaped) + t.href.Value;
                }

                return t.href.Value;
            })
            .AddToEnd(baseUrl)
            .FirstOrDefault();
    }

    // Extract links
    DocumentWithLinks links = htmlDoc.GetLinks();
    foreach (string link in links.Links.Union(links.References))
    {
        if (link.IsNullOrEmpty())
        {
            continue;
        }

        string decodedLink = ExtendedHtmlUtility.HtmlEntityDecode(link);
        string normalizedLink = NormalizeLink(baseUrl, decodedLink);
        if (normalizedLink.IsNullOrEmpty())
        {
            continue;
        }

        crawler.Crawl(new Uri(normalizedLink), propertyBag);
    }

    return Task.FromResult(true);
}
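// A hedged end-to-end sketch of how steps like those in this listing are
// composed into a crawl. The Crawler constructor and Crawl() call are modeled
// on NCrawler-style APIs; the exact signatures and the step class names below
// are assumptions, not confirmed by these snippets.
using (var c = new Crawler(
    new Uri("http://example.com"),
    new DownloaderPipelineStep(),   // hypothetical name for the HTTP downloader above
    new HtmlDocumentProcessor(),    // hypothetical name for the HTML step above
    new EmailExtractionStep()))     // hypothetical name for the e-mail regex step
{
    c.Crawl();
}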
public Task ProcessAsync(ICrawler crawler, PropertyBag propertyBag)
{
    return this.ProcessCoreAsync(crawler, propertyBag);
}
public RunCrawlerCommandHandler(ICrawler crawler, IEventRepository eventRepository)
{
    _crawler = crawler;
    _eventRepository = eventRepository;
}
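// A hypothetical composition-root registration for the handler above, using
// Microsoft.Extensions.DependencyInjection inside some ConfigureServices
// method; the concrete Crawler and EventRepository types named here are
// assumptions.
services.AddSingleton<ICrawler, Crawler>();
services.AddSingleton<IEventRepository, EventRepository>();
services.AddTransient<RunCrawlerCommandHandler>();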
public CrawlerResolver(IUserAgentService service)
    : base(service)
{
    _crawler = GetCrawler();
}