Example #1
0
        /// <summary>
        /// Reads one RSS feed and handles every item published after the feed's last run:
        /// the item is either queued for database storage as-is, or added to
        /// <paramref name="rssFeedsWebLocationsToProcess"/> so its link is crawled as a web page.
        /// Does nothing if scraping has been stopped.
        /// </summary>
        /// <param name="rssFeed">
        /// The feed to read. Its <c>URL</c>, <c>SourceType</c>, <c>Category</c> and <c>CrawlerType</c> are read,
        /// and its <c>LastRunTime</c> is updated once the feed has been handled.
        /// </param>
        /// <param name="rssFeedsWebLocationsToProcess">
        /// Receives the feed items whose links are to be followed (every crawler type other than
        /// <c>CrawlerType.RSSContentNoLinkFollowUp</c>).
        /// </param>
        private void ProcessRSSFeed(Source rssFeed, List<RSS.FeedItem> rssFeedsWebLocationsToProcess)
        {
            // Check to see if the scraping is to be stopped
            if (!this.isServiceScraping)
            {
                return;
            }

            // Take the watermark BEFORE fetching. Stamping the time after the loop (as was done previously)
            // silently drops every item published while the feed is being processed: such an item is not in
            // this run's result, yet its Published time is <= the new LastRunTime on the next run.
            // The trade-off is that an item published while the fetch itself is in flight may be seen twice.
            var runStartTime = DateTime.Now;

            var rssItems = RSSCrawler.ProcessRSSFeed(rssFeed.URL, ConfigurationManager.AppSettings.RSSFeedHistoryRangeForScrapingInDays);

            if (rssItems == null)
            {
                Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteWebScrapingInformation("WARNING: Unable to process rss feed: " + rssFeed.URL);
            }
            else
            {
                foreach (var rssItem in rssItems)
                {
                    // Check to see if the scraping is to be stopped
                    // NOTE(review): LastRunTime is still advanced below when we stop early, so the items not
                    // yet visited are skipped on the next run - confirm whether that is intended.
                    if (!this.isServiceScraping)
                    {
                        break;
                    }

                    // Skip this item if it has been processed earlier
                    if (rssItem.Published <= rssFeed.LastRunTime)
                    {
                        continue;
                    }

                    rssItem.SourceType = rssFeed.SourceType;
                    rssItem.Category   = rssFeed.Category;

                    if (rssFeed.CrawlerType != CrawlerType.RSSContentNoLinkFollowUp)
                    {
                        // Add the RSS item to be processed as a normal web page, follow the RSS item link
                        rssFeedsWebLocationsToProcess.Add(rssItem);
                        continue;
                    }

                    // Only the RSS feed data is to be saved, not the actual data behind the feed item.
                    // NOTE(review): .Result blocks this thread until the queue operation completes; this method
                    // is synchronous, so making it async would require changing its callers.
                    var queueOperation = Analyzer.Common.Database.DatabaseService.GetInstance().AddtoWriteQueueAsync<Analyzer.Common.Database.DataItems.WebData>(rssItem.SourceType.ToString(), rssItem.ToWebData());
                    if (queueOperation.Result)
                    {
                        this.itemsProcessedCounter++;
                    }
                    else
                    {
                        Analyzer.Common.Logger.ExceptionLoggingService.Instance.WriteWebScrapingInformation("WARNING: Unable to add the rss item to the database queue: " + rssItem.Url + " from feed: " + rssFeed.URL);
                        this.itemsFailedCounter++;
                    }
                }
            }

            rssFeed.LastRunTime = runStartTime;

            // avoid being banned or causing harm to the scraped site, being nice :)
            System.Threading.Thread.Sleep(Analyzer.Common.Configuration.ConfigurationManager.AppSettings.RSSFeedReaderWaitTimeInMilliseconds);
        }
Example #2
0
        /// <summary>
        /// Click handler: reads the RSS feed at the URL typed into <c>tbURL</c>, runs
        /// <c>WebCrawler.ProcessWordpressArticle</c> on every item's URL and appends each result
        /// to <c>tbResult</c> on a new line.
        /// </summary>
        /// <param name="sender">The control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void bCrawlURL_Click(object sender, EventArgs e)
        {
            // Second argument of RSSCrawler.ProcessRSSFeed.
            // NOTE(review): presumably a history range in days, as in the service's feed processing - confirm.
            const int historyRange = 60;

            var feedUrl = this.tbURL.Text;
            var rssItems = RSSCrawler.ProcessRSSFeed(feedUrl, historyRange);

            // ProcessRSSFeed signals "feed could not be processed" with null; enumerating it would
            // throw a NullReferenceException inside the UI handler, so report the failure instead.
            if (rssItems == null)
            {
                this.tbResult.Text += "\n" + "Unable to process rss feed: " + feedUrl;
                return;
            }

            foreach (var rssItem in rssItems)
            {
                // NOTE(review): .Result blocks the calling (UI) thread until the article has been processed;
                // consider awaiting instead if ProcessWordpressArticle returns a Task.
                this.tbResult.Text += "\n" + Analyzer.WebCrawler.Web.WebCrawler.ProcessWordpressArticle(rssItem.Url).Result;
            }
        }