/// <summary>
/// Crawls every active RSS channel and replaces its stored feed items with the freshly fetched ones.
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Ensures the log folder exists and deletes log files whose date prefix is older than the retention window.
/// 2. Opens a new log file for this run.
/// 3. For each active channel (ordered by id): fetches the feed, records which engine was used,
///    deletes the channel's old items, inserts the new ones that are valid, not blacklisted and not
///    already known, and updates the channel's error status.
/// A failure on one channel is logged, stored on the channel and passed to <c>ErrorHandle</c>;
/// the crawl then continues with the next channel.
/// </remarks>
public static void CrawlRss()
        {
            // Number of days a log file is kept before it is deleted.
            const int LogRetentionDays = 10;

            var logFolder = Path.Combine(EnvironmentHelper.GetApplicationRoot(), "../Logs/");

            // CreateDirectory does nothing when the folder already exists.
            Directory.CreateDirectory(logFolder);

            // Log files are named "<timestamp>-crawler.txt"; the part before the first '-' is the timestamp.
            // Files are deleted as they are found instead of being collected into a dictionary keyed by
            // timestamp, which threw when two files shared the same timestamp prefix.
            var oldestKeptLogDate = DateTime.Now.AddDays(-LogRetentionDays);
            foreach (var file in Directory.GetFiles(logFolder))
            {
                var dtText = Path.GetFileNameWithoutExtension(file).Split('-')[0];
                if (DateTime.TryParseExact(dtText, LogDateTimeFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime dt)
                    && dt < oldestKeptLogDate)
                {
                    File.Delete(file);
                }
            }

            // Format with the invariant culture so the name round-trips through the TryParseExact call above.
            var logFileName = $"{DateTime.Now.ToString(LogDateTimeFormat, CultureInfo.InvariantCulture)}-crawler.txt";
            var logger      = new MyLogger(Path.Combine(logFolder, logFileName));

            // Exceptions can contain reference loops; without this setting the serializer itself throws.
            var errorJsonSettings = new JsonSerializerSettings { ReferenceLoopHandling = ReferenceLoopHandling.Ignore };

            // Declared outside the try so the outer catch can report the url being processed.
            var feedUrl = string.Empty;

            try
            {
                List<RssChannelRow> channels = SimpleFeedlyDatabaseAccess.GetActiveChannels().OrderBy(x => x.Id).ToList();

                logger.Info($"There are {channels.Count} active channels");

                var progressCounter = 0;
                foreach (var channel in channels)
                {
                    progressCounter++;

                    feedUrl = channel.Link;

                    if (string.IsNullOrWhiteSpace(feedUrl))
                    {
                        continue;
                    }

                    try
                    {
                        logger.Info($"- [{progressCounter}/{channels.Count}] Fetching url {feedUrl}");
                        var feed = GetFeedsFromChannel(feedUrl, channel.RssCrawlerEngine, out RssCrawlerEngine usedEngine, out Exception fetchFeedError);

                        logger.Info($"  - Nbr of feed items {feed?.Items?.Count ?? 0}");

                        // update default engine for channel (fall back to CodeHollowFeedReader when the fetch failed)
                        SimpleFeedlyDatabaseAccess.UpdateChannelDefaultEngine(channel.Id, fetchFeedError != null ? RssCrawlerEngine.CodeHollowFeedReader : usedEngine);

                        if (feed?.Items == null)
                        {
                            logger.Info("  - [NO ITEMS]");
                            SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(
                                channel.Id,
                                true,
                                fetchFeedError == null ? null : JsonConvert.SerializeObject(fetchFeedError, errorJsonSettings));

                            if (fetchFeedError != null)
                            {
                                ErrorHandle(fetchFeedError, feedUrl);
                            }

                            continue;
                        }

                        var fetchedItems = feed.Items;

                        // An empty feed leaves the stored items and the channel status untouched.
                        if (fetchedItems.Count == 0)
                        {
                            continue;
                        }

                        SimpleFeedlyDatabaseAccess.DeleteAllFeedItemByChannelId(channel.Id);
                        logger.Info("  - Deleted old items");

                        var channelDomainGroup = string.IsNullOrEmpty(channel.DomainGroup) ? channel.Link : channel.DomainGroup;
                        var insertItems        = new List<RssFeedItemRow>();

                        foreach (var fItem in fetchedItems)
                        {
                            if (!StringUtils.IsUrl(fItem.Link))
                            {
                                continue;
                            }

                            var feedItemKey = GenerateFeedItemKey(fItem);

                            if (string.IsNullOrWhiteSpace(feedItemKey) || string.IsNullOrWhiteSpace(fItem.Link))
                            {
                                continue;
                            }

                            var feedItem = new RssFeedItemRow
                            {
                                Channel        = channel,
                                FeedItemKey    = feedItemKey,
                                // Items without a title are shown by their link.
                                Title          = string.IsNullOrWhiteSpace(fItem.Title) ? fItem.Link : fItem.Title,
                                Link           = fItem.Link,
                                Description    = fItem.Description,
                                PublishingDate = fItem.PublishingDate,
                                Author         = fItem.Author
                            };

                            // The blacklist is looked up by the MD5 hash of the normalised, lower-cased title.
                            var shrinkedTitle     = StringUtils.UnsignString(StringUtils.RemoveNonAlphaCharactersAndDigit(feedItem.Title)).ToLower();
                            var shrinkedTitleHash = StringUtils.MD5Hash(shrinkedTitle);

                            if (SimpleFeedlyDatabaseAccess.IsBlackListWord(shrinkedTitleHash))
                            {
                                continue;
                            }

                            if (!SimpleFeedlyDatabaseAccess.IsExistedFeedItem(channel.Id, channelDomainGroup, feedItem.FeedItemKey))
                            {
                                insertItems.Add(feedItem);
                            }
                        }

                        SimpleFeedlyDatabaseAccess.InsertFeedItems(insertItems);

                        logger.Info($"  - Inserted {insertItems.Count} items");

                        SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, false, null);
                        logger.Info("  - Updated status");
                    }
                    catch (Exception err)
                    {
                        // Serialise once, loop-safe, and reuse for both the log and the stored channel status.
                        var errJson = JsonConvert.SerializeObject(err, errorJsonSettings);

                        logger.Error($"  - Got Error: {errJson}");

                        SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, true, errJson);
                        ErrorHandle(err, feedUrl);
                    }
                }
            }
            catch (Exception ex)
            {
                logger.Error($"  - [ERROR]: {JsonConvert.SerializeObject(ex, errorJsonSettings)}");

                ErrorHandle(ex, feedUrl);
            }

            logger.Info("Done!");
        }
        // Example #2
        /// <summary>
        /// Entry point: runs the RSS crawler, then regenerates the feed list embedded in
        /// <c>index.html</c> between the <c>RSSDATA:START</c> and <c>RSSDATA:END</c> markers.
        /// </summary>
        /// <remarks>
        /// Feed items are rendered grouped by channel, in the order returned by
        /// <c>SimpleFeedlyDatabaseAccess.GetAllFeedItems()</c>; a new group starts whenever the channel id
        /// changes from one item to the next. At most <c>MaxItemsPerChannel</c> items are rendered per group.
        /// </remarks>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Maximum number of feed items rendered under each channel heading.
            const int MaxItemsPerChannel = 20;

            var appRootPath = EnvironmentHelper.GetApplicationRoot();

            RssCrawler.CrawlRss();

            string indexFilePath = Path.Combine(appRootPath, "../", "index.html");
            string indexContent  = File.ReadAllText(indexFilePath, Encoding.UTF8);

            // Channels whose title mentions "medium" are excluded. The keyword is fixed, so the comparison
            // is ordinal; a channel without a title is kept rather than causing a NullReferenceException.
            var feedItems = SimpleFeedlyDatabaseAccess.GetAllFeedItems()
                .Where(x => !(x.Channel.Title?.Contains("medium", System.StringComparison.OrdinalIgnoreCase) ?? false))
                .ToList();

            ObjectId currentChannelId = null;

            var sb             = new StringBuilder();
            var hasOpenList    = false; // true while a <ul> has been written but not yet closed
            var itemsInChannel = 0;

            foreach (var item in feedItems)
            {
                var isNewChannel = !hasOpenList || item.Channel.Id != currentChannelId;
                currentChannelId = item.Channel.Id;

                if (isNewChannel)
                {
                    if (hasOpenList)
                    {
                        sb.AppendLine("</ul>"); // close the previous channel's list
                    }

                    // Feed data is untrusted: HTML-encode everything that is interpolated into markup.
                    var channelTitle = System.Net.WebUtility.HtmlEncode(item.Channel.Title);
                    var channelLink  = System.Net.WebUtility.HtmlEncode(item.Channel.Link);

                    sb.AppendLine($"<h2 class='channel-title'># <a href='{channelLink}' target='_blank'>{channelTitle}</a></h2>");
                    sb.AppendLine("<ul class='feed-items'>");

                    hasOpenList    = true;
                    itemsInChannel = 0;
                }

                // just get top MaxItemsPerChannel feed items of each channel
                if (itemsInChannel >= MaxItemsPerChannel)
                {
                    continue;
                }

                var itemLink  = System.Net.WebUtility.HtmlEncode(item.Link);
                var itemTitle = System.Net.WebUtility.HtmlEncode(item.Title);

                sb.Append($"<li><a href='{itemLink}'>{itemTitle}</a></li>");

                itemsInChannel++;
            }

            // Close the last channel's list; nothing is emitted when there are no feed items at all.
            if (hasOpenList)
            {
                sb.AppendLine("</ul>");
            }

            var replacement = $"<!-- RSSDATA:START -->\n{sb}\n<!-- RSSDATA:END -->";

            // A MatchEvaluator inserts the text literally. Passing it as a replacement string would make
            // the regex engine interpret "$1", "$0", "${name}" ... occurring in feed titles as substitutions.
            var newContent = Regex.Replace(
                indexContent,
                "<!-- RSSDATA:START -->(?:[^\n]*(\n+))+<!-- RSSDATA:END -->",
                _ => replacement);

            WriteAllText(indexFilePath, newContent);
        }