/// <summary>
/// Reads a single RSS feed and handles every item published after the feed's last run:
/// depending on the feed's crawler type the item is either queued for writing to the
/// database as-is, or added to <paramref name="rssFeedsWebLocationsToProcess"/> so that
/// its link is followed later.
/// </summary>
/// <param name="rssFeed">
/// The feed to read. Its <c>LastRunTime</c> is advanced to the current time only when the
/// feed was retrieved and every returned item was examined.
/// </param>
/// <param name="rssFeedsWebLocationsToProcess">
/// Receives the items whose links are to be processed as normal web pages.
/// </param>
private void ProcessRSSFeed(Source rssFeed, List<RSS.FeedItem> rssFeedsWebLocationsToProcess)
{
    // Check to see if the scraping is to be stopped
    if (!this.isServiceScraping)
    {
        return;
    }

    var rssItems = RSSCrawler.ProcessRSSFeed(rssFeed.URL, ConfigurationManager.AppSettings.RSSFeedHistoryRangeForScrapingInDays);

    // LastRunTime is the cut-off used to skip items on the next run, so it may only move
    // forward when every item of this run was actually looked at. Advancing it after a
    // failed read or an interrupted loop would silently drop the unexamined items.
    var feedFullyProcessed = false;

    if (rssItems != null)
    {
        feedFullyProcessed = true;

        foreach (var rssItem in rssItems)
        {
            // Check to see if the scraping is to be stopped
            if (!this.isServiceScraping)
            {
                // Remaining items were not examined; keep the old cut-off so they are retried.
                feedFullyProcessed = false;
                break;
            }

            // Skip this item if it has been processed earlier
            if (rssItem.Published <= rssFeed.LastRunTime)
            {
                continue;
            }

            rssItem.SourceType = rssFeed.SourceType;
            rssItem.Category = rssFeed.Category;

            // Check to see if only the RSS feed data is to be saved and not the actual data behind the feed item
            if (rssFeed.CrawlerType == CrawlerType.RSSContentNoLinkFollowUp)
            {
                var result = Analyzer.Common.Database.DatabaseService.GetInstance().AddtoWriteQueueAsync<Analyzer.Common.Database.DataItems.WebData>(rssItem.SourceType.ToString(), rssItem.ToWebData());

                // NOTE(review): .Result blocks the calling thread until the queue operation completes;
                // removing the block would require making this method (and its callers) async.
                if (!result.Result)
                {
                    Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteWebScrapingInformation("WARNING: Unable to add the rss item to the database queue: " + rssItem.Url + " from feed: " + rssFeed.URL);
                    this.itemsFailedCounter++;
                }
                else
                {
                    this.itemsProcessedCounter++;
                }
            }
            else
            {
                // Add the RSS item to be processed as a normal web page, follow the RSS item link
                rssFeedsWebLocationsToProcess.Add(rssItem);
            }
        }
    }
    else
    {
        Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteWebScrapingInformation("WARNING: Unable to process rss feed: " + rssFeed.URL);
    }

    if (feedFullyProcessed)
    {
        rssFeed.LastRunTime = DateTime.Now;
    }

    // avoid being banned or causing harm to the scraped site, being nice :)
    System.Threading.Thread.Sleep(Analyzer.Common.Configuration.ConfigurationManager.AppSettings.RSSFeedReaderWaitTimeInMilliseconds);
}
/// <summary>
/// Click handler: reads the RSS feed at the URL entered in <c>tbURL</c> and appends the
/// result of processing each feed item's link to <c>tbResult</c>, one entry per item.
/// </summary>
/// <param name="sender">The control that raised the event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void bCrawlURL_Click(object sender, EventArgs e)
{
    // Second argument of RSSCrawler.ProcessRSSFeed; presumably the history range in days - confirm.
    const int historyRange = 60;

    var feedUrl = this.tbURL.Text;
    var rssItems = RSSCrawler.ProcessRSSFeed(feedUrl, historyRange);

    // The crawler can hand back null when the feed could not be processed;
    // report that instead of failing with a NullReferenceException.
    if (rssItems == null)
    {
        this.tbResult.Text += "\n" + "WARNING: Unable to process rss feed: " + feedUrl;
        return;
    }

    foreach (var rssItem in rssItems)
    {
        // NOTE(review): .Result blocks the UI thread until the article has been processed.
        this.tbResult.Text += "\n" + Analyzer.WebCrawler.Web.WebCrawler.ProcessWordpressArticle(rssItem.Url).Result;
    }
}