/// <summary>
/// Event handler invoked when a page crawl has completed. Skips pages that
/// failed at the HTTP level or that the configured settings reject; otherwise
/// parses the page, records the parsed item in <c>news_list</c>, and feeds its
/// split text into the word counting helper together with <c>words_dictionary</c>.
/// </summary>
/// <param name="sender">Event source; not used by this handler.</param>
/// <param name="e">Crawl result carrying the crawled page and the crawl context.</param>
private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    var page = e.CrawledPage;

    // A recorded request exception means there is nothing to parse for this page.
    if (page.HttpRequestException != null)
    {
        return;
    }

    // The settings object is taken from the crawl bag of the current crawl context.
    IParserSettings parserSettings = e.CrawlContext.CrawlBag.Settings;
    if (!parserSettings.IsPageParseAllowed(page))
    {
        return;
    }

    // The parser is read from the crawl bag only after the page has been accepted.
    IParser<NewsData> newsParser = e.CrawlContext.CrawlBag.Parser;

    var document = page.AngleSharpHtmlDocument;
    var pageUrl = page.Uri.AbsoluteUri;
    var parsedNews = newsParser.Parse(document, pageUrl);

    // The parser returns null when it could not produce an item for this page.
    if (parsedNews == null)
    {
        return;
    }

    news_list.Add(parsedNews);

    var tokens = TextProcessingHelper.TextSplittingAndRemovingSymbols(parsedNews.Text);
    TextProcessingHelper.CountFrequentWords(ref words_dictionary, tokens);

    // Progress output: the page URI followed by a separator line.
    Console.WriteLine(page.Uri);
    Console.WriteLine("=================================");
}
/// <summary>
/// Program entry point. Runs the demo crawler with the Nurkz settings and
/// parser, hands the collected news to <c>DbHelper</c>, adjusts
/// <c>words_dictionary</c> using the text of items flagged <c>IsExist</c>,
/// stores the dictionary, and then waits for a key press.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static async Task Main(string[] args)
{
    #region Crawler
    Console.WriteLine("Demo starting up!");

    IParserSettings nurkzSettings = new NurkzSettings();
    IParser<NewsData> nurkzParser = new NurkzParser();
    await DemoSimpleCrawler(nurkzSettings, nurkzParser);

    // Snapshot the shared collection into an array (the original note describes
    // news_list as a BlockingCollection<T>) so it can be passed by reference.
    var collectedNews = news_list.ToArray();
    DbHelper.InsertOrSkipIfExistNewsData(ref collectedNews);

    // Texts of the items flagged IsExist after the insert-or-skip call; the
    // original note says these are news already present in the DB "News".
    var alreadyStoredTexts =
        (from item in collectedNews
         where item.IsExist
         select item.Text).ToArray();

    var alreadyStoredWords = TextProcessingHelper.ConcatenateMultipleTextToWords(alreadyStoredTexts);
    TextProcessingHelper.RemoveOrDecreaseFrequentWords(ref words_dictionary, alreadyStoredWords);

    DbHelper.InsertFrequentWords(words_dictionary);

    Console.WriteLine("Press to terminate...");
    Console.ReadKey();
    #endregion
}