Example #1
0
        /// <summary>
        /// Handles the crawler's page-completed event: parses the crawled page into a
        /// news item, stores it in <c>news_list</c>, updates the word counts in
        /// <c>words_dictionary</c>, and prints the page URI to the console.
        /// </summary>
        /// <remarks>
        /// The page is silently skipped when its request recorded an exception, when the
        /// settings taken from the crawl bag disallow parsing it, or when the parser
        /// returns <c>null</c>.
        /// </remarks>
        /// <param name="sender">Event source; not used by this handler.</param>
        /// <param name="e">Event data carrying the crawled page and the crawl context.</param>
        private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            var page = e.CrawledPage;

            // Nothing to parse when the request itself recorded an exception.
            if (page.HttpRequestException != null)
            {
                return;
            }

            // Settings and parser are read from the crawl bag; the explicit interface
            // types keep the subsequent calls statically bound.
            IParserSettings parseSettings = e.CrawlContext.CrawlBag.Settings;
            if (!parseSettings.IsPageParseAllowed(page))
            {
                return;
            }

            IParser<NewsData> newsParser = e.CrawlContext.CrawlBag.Parser;
            var parsedNews = newsParser.Parse(page.AngleSharpHtmlDocument, page.Uri.AbsoluteUri);
            if (parsedNews == null)
            {
                // The parser produced no result for this page.
                return;
            }

            news_list.Add(parsedNews);

            // Fold the words of this item's text into the shared frequency dictionary.
            var tokens = TextProcessingHelper.TextSplittingAndRemovingSymbols(parsedNews.Text);
            TextProcessingHelper.CountFrequentWords(ref words_dictionary, tokens);

            Console.WriteLine(page.Uri);
            Console.WriteLine("=================================");
        }
Example #2
0
        /// <summary>
        /// Program entry point: runs the demo crawler with the Nurkz settings and parser,
        /// stores the collected news through <c>DbHelper</c>, adjusts the word-frequency
        /// dictionary for items flagged as already existing, saves the frequencies, and
        /// waits for a key press before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static async Task Main(string[] args)
        {
            #region Crawler
            Console.WriteLine("Demo starting up!");

            IParserSettings nurkzSettings = new NurkzSettings();
            IParser<NewsData> nurkzParser = new NurkzParser();
            await DemoSimpleCrawler(nurkzSettings, nurkzParser);

            // Snapshot the shared collection into an array so it can be passed by ref.
            var collectedNews = news_list.ToArray();
            DbHelper.InsertOrSkipIfExistNewsData(ref collectedNews);

            // Texts of the items flagged IsExist (presumably those already present in
            // the "News" table - TODO confirm); their words are taken back out of the
            // frequency dictionary below.
            var duplicateTexts = (from item in collectedNews
                                  where item.IsExist
                                  select item.Text).ToArray();
            var duplicateWords = TextProcessingHelper.ConcatenateMultipleTextToWords(duplicateTexts);

            TextProcessingHelper.RemoveOrDecreaseFrequentWords(ref words_dictionary, duplicateWords);

            DbHelper.InsertFrequentWords(words_dictionary);

            Console.WriteLine("Press to terminate...");
            Console.ReadKey();
            #endregion
        }