/// <summary>
/// Crawls every active RSS channel and replaces the channel's stored feed items
/// with the items currently published by its feed.
/// </summary>
/// <remarks>
/// A per-run log file named "&lt;timestamp&gt;-crawler.txt" is written to the "../Logs/" folder
/// relative to the application root, and dated log files older than ten days are removed.
/// An error raised while processing one channel is logged, recorded on that channel and
/// passed to <c>ErrorHandle</c>; the loop then continues with the next channel.
/// </remarks>
public static void CrawlRss()
{
    var logFolder = Path.Combine(EnvironmentHelper.GetApplicationRoot(), "../Logs/");
    if (!Directory.Exists(logFolder))
    {
        Directory.CreateDirectory(logFolder);
    }

    // Format the timestamp with the invariant culture: PurgeExpiredLogFiles parses it back
    // with the invariant culture, so both sides must agree regardless of the machine locale.
    var logFileName = $"{DateTime.Now.ToString(LogDateTimeFormat, CultureInfo.InvariantCulture)}-crawler.txt";
    var logger = new MyLogger(Path.Combine(logFolder, logFileName));

    // The logger is created first so that purge failures can be reported.
    PurgeExpiredLogFiles(logFolder, logger);

    var feedUrl = string.Empty;
    try
    {
        List<RssChannelRow> channels = SimpleFeedlyDatabaseAccess.GetActiveChannels().OrderBy(x => x.Id).ToList();
        logger.Info($"There are {channels.Count} active channels");

        var progressCounter = 0;
        foreach (var channel in channels)
        {
            progressCounter++;
            feedUrl = channel.Link;
            if (string.IsNullOrWhiteSpace(feedUrl))
            {
                continue;
            }

            try
            {
                logger.Info($"- [{progressCounter}/{channels.Count}] Fetching url {feedUrl}");
                var feed = GetFeedsFromChannel(feedUrl, channel.RssCrawlerEngine, out RssCrawlerEngine usedEngine, out Exception fetchFeedError);
                logger.Info($" - Nbr of feed items {feed?.Items?.Count ?? 0}");

                // Update the default engine for the channel; when the fetch reported an error
                // the default is set to CodeHollowFeedReader instead of the engine that was used.
                SimpleFeedlyDatabaseAccess.UpdateChannelDefaultEngine(
                    channel.Id,
                    fetchFeedError != null ? RssCrawlerEngine.CodeHollowFeedReader : usedEngine);

                if (feed?.Items == null)
                {
                    logger.Info($" - [NO ITEMS]");
                    SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(
                        channel.Id,
                        true,
                        fetchFeedError == null ? null : SerializeException(fetchFeedError));
                    if (fetchFeedError != null)
                    {
                        ErrorHandle(fetchFeedError, feedUrl);
                    }

                    continue;
                }

                if (feed.Items.Count == 0)
                {
                    // An empty feed leaves the stored items and the channel status untouched.
                    continue;
                }

                SimpleFeedlyDatabaseAccess.DeleteAllFeedItemByChannelId(channel.Id);
                logger.Info($" - Deleted old items");

                // Loop-invariant: channels without a domain group fall back to their own link.
                var channelDomainGroup = string.IsNullOrEmpty(channel.DomainGroup) ? channel.Link : channel.DomainGroup;

                var insertItems = new List<RssFeedItemRow>();
                foreach (var fItem in feed.Items)
                {
                    if (!StringUtils.IsUrl(fItem.Link))
                    {
                        continue;
                    }

                    var feedItemKey = GenerateFeedItemKey(fItem);
                    if (string.IsNullOrWhiteSpace(feedItemKey) || string.IsNullOrWhiteSpace(fItem.Link))
                    {
                        continue;
                    }

                    var feedItem = new RssFeedItemRow
                    {
                        Channel = channel,
                        FeedItemKey = feedItemKey,
                        // Items without a title are shown by their link.
                        Title = string.IsNullOrWhiteSpace(fItem.Title) ? fItem.Link : fItem.Title,
                        Link = fItem.Link,
                        Description = fItem.Description,
                        PublishingDate = fItem.PublishingDate,
                        Author = fItem.Author
                    };

                    // The blacklist is keyed by the MD5 hash of the normalized, lower-cased title.
                    var shrinkedTitle = StringUtils.UnsignString(StringUtils.RemoveNonAlphaCharactersAndDigit(feedItem.Title)).ToLower();
                    var shrinkedTitleHash = StringUtils.MD5Hash(shrinkedTitle);
                    if (SimpleFeedlyDatabaseAccess.IsBlackListWord(shrinkedTitleHash))
                    {
                        continue;
                    }

                    if (!SimpleFeedlyDatabaseAccess.IsExistedFeedItem(channel.Id, channelDomainGroup, feedItem.FeedItemKey))
                    {
                        insertItems.Add(feedItem);
                    }
                }

                SimpleFeedlyDatabaseAccess.InsertFeedItems(insertItems);
                logger.Info($" - Inserted {insertItems.Count} items");

                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, false, null);
                logger.Info($" - Updated status");
            }
            catch (Exception err)
            {
                // Serialize once with loop-safe settings: a serializer failure inside this handler
                // would escape to the outer catch and stop the crawl for all remaining channels.
                var errorJson = SerializeException(err);
                logger.Error($" - Got Error: {errorJson}");
                SimpleFeedlyDatabaseAccess.UpdateChannelErrorStatus(channel.Id, true, errorJson);
                ErrorHandle(err, feedUrl);
            }
        }
    }
    catch (Exception ex)
    {
        logger.Error($" - [ERROR]: {SerializeException(ex)}");
        ErrorHandle(ex, feedUrl);
    }

    logger.Info($"Done!");
}

/// <summary>
/// Deletes the files in <paramref name="logFolder"/> whose name starts with a timestamp
/// (the text before the first '-') that is more than ten days in the past.
/// </summary>
/// <param name="logFolder">Folder that holds the crawler log files.</param>
/// <param name="logger">Logger that receives a message for every file that could not be deleted.</param>
private static void PurgeExpiredLogFiles(string logFolder, MyLogger logger)
{
    const int retentionDays = 10;
    var cutoff = DateTime.Now.AddDays(-retentionDays);

    foreach (var file in Directory.GetFiles(logFolder))
    {
        // Split always yields at least one element, so index 0 is safe.
        var datePrefix = Path.GetFileNameWithoutExtension(file).Split('-')[0];
        var isDatedLog = DateTime.TryParseExact(
            datePrefix,
            LogDateTimeFormat,
            CultureInfo.InvariantCulture,
            DateTimeStyles.None,
            out DateTime loggedAt);
        if (!isDatedLog || loggedAt >= cutoff)
        {
            continue;
        }

        try
        {
            File.Delete(file);
        }
        catch (Exception ex) when (ex is IOException || ex is UnauthorizedAccessException)
        {
            // Retention is best-effort: a locked or protected log file must not stop the crawl.
            logger.Error($"Could not delete expired log file {file}: {ex.Message}");
        }
    }
}

/// <summary>
/// Serializes an exception to JSON, ignoring reference loops so that exceptions whose
/// object graph refers back to itself do not make the serializer throw.
/// </summary>
/// <param name="error">The exception to serialize.</param>
/// <returns>The JSON text produced by Json.NET.</returns>
private static string SerializeException(Exception error)
{
    return JsonConvert.SerializeObject(
        error,
        new JsonSerializerSettings { ReferenceLoopHandling = ReferenceLoopHandling.Ignore });
}
/// <summary>
/// Entry point: crawls all RSS channels, then regenerates the feed listing that sits between
/// the "RSSDATA:START" and "RSSDATA:END" marker comments of index.html.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var appRootPath = EnvironmentHelper.GetApplicationRoot();

    RssCrawler.CrawlRss();

    string indexFilePath = Path.Combine(appRootPath, "../", "index.html");
    string indexContent = File.ReadAllText(indexFilePath, Encoding.UTF8);

    // Channels whose title contains "medium" are deliberately left out of the page.
    var feedItems = SimpleFeedlyDatabaseAccess.GetAllFeedItems();
    feedItems = feedItems.Where(x => !x.Channel.Title.Contains("medium", System.StringComparison.CurrentCultureIgnoreCase)).ToList();

    const int maxItemsPerChannel = 20;
    ObjectId currentChannelId = null;
    var sb = new StringBuilder();
    var itemsInChannel = 0;

    // NOTE(review): the grouping below relies on GetAllFeedItems returning the items of one
    // channel contiguously - confirm against the data-access layer.
    foreach (var item in feedItems)
    {
        var isNewChannel = item.Channel.Id != currentChannelId;
        currentChannelId = item.Channel.Id;

        if (isNewChannel)
        {
            if (sb.Length > 0)
            {
                // Close the list of the previous channel.
                sb.AppendLine("</ul>");
            }

            // Titles and links come from external feeds, so they are HTML-encoded before being
            // written into markup. The heading links to the channel's link, not to its title.
            var channelTitle = System.Net.WebUtility.HtmlEncode(item.Channel.Title);
            var channelLink = System.Net.WebUtility.HtmlEncode(item.Channel.Link);
            sb.AppendLine($"<h2 class='channel-title'># <a href='{channelLink}' target='_blank'>{channelTitle}</a></h2>");
            sb.AppendLine("<ul class='feed-items'>");
            itemsInChannel = 0;
        }
        else if (itemsInChannel >= maxItemsPerChannel)
        {
            // Only the first 20 items of each channel are listed.
            continue;
        }

        var itemLink = System.Net.WebUtility.HtmlEncode(item.Link);
        var itemTitle = System.Net.WebUtility.HtmlEncode(item.Title);
        sb.Append($"<li><a href='{itemLink}'>{itemTitle}</a></li>");
        itemsInChannel++;
    }

    if (sb.Length > 0)
    {
        // Close the list of the last channel so that it is part of the output as well.
        sb.AppendLine("</ul>");
    }

    var change = sb.ToString();
    var replacement = $"<!-- RSSDATA:START -->\n{change}\n<!-- RSSDATA:END -->";

    // A MatchEvaluator is used instead of a replacement string so that '$' sequences in feed
    // titles (e.g. "$1") are inserted literally rather than treated as regex substitutions.
    var newContent = Regex.Replace(
        indexContent,
        "<!-- RSSDATA:START -->(?:[^\n]*(\n+))+<!-- RSSDATA:END -->",
        match => replacement);

    WriteAllText(indexFilePath, newContent);
}