Пример #1
0
        /// <summary>
        /// Entry point for the web crawler.
        /// <para>
        /// Argument 1 is the log level (defaults to "Information").
        /// Argument 2 is the word limit (defaults to 10; a value that is not an integer keeps the default).
        /// Argument 3 is a list of words, separated by commas, to be excluded by the crawler.
        /// There cannot be spaces between the words unless the list is in " ";
        /// inside a quoted list, spaces are treated as separators as well.
        /// </para>
        /// </summary>
        ///
        /// <param name="args">
        /// The command line arguments, all optional, in the order described above.
        /// </param>
        static void Main(string[] args)
        {
            // defaults used for every argument that is not supplied

            int           wordLimit         = 10;
            List <string> excludedWords     = new List <string>();
            string        level             = "Information";
            string        rejectedWordLimit = null;

            // get the command line args

            if (args.Length >= 1)
            {
                level = args[0].Trim();
            }

            if (args.Length >= 2)
            {
                var arg = args[1].Trim();
                int parsedLimit;

                // Parse into a temporary: TryParse writes 0 to its out parameter on
                // failure, which would otherwise wipe out the default word limit.
                if (Int32.TryParse(arg, out parsedLimit))
                {
                    wordLimit = parsedLimit;
                }
                else
                {
                    // Remember the bad value; it is reported once the logger is configured.
                    rejectedWordLimit = arg;
                }
            }

            if (args.Length >= 3)
            {
                // spaces (only possible in a quoted argument) count as separators too
                var arg = args[2].Trim().Replace(" ", ",");
                excludedWords = arg.TokenizeToList(",");
            }

            // configure the logger

            LogHelper.ConfigureLogger(level);

            if (rejectedWordLimit != null)
            {
                // Reported here rather than while parsing, because the logger is only
                // configured by the call above.
                Log.Logger.Warning($"Could not set word limit to desired configuration {rejectedWordLimit}.");
            }

            Log.Logger.Information($"Starting Crawler with configurations: Log_Level={level}, Word_Limit={wordLimit}, Excluded_Words={excludedWords.ToPrettyString()}");

            // start the crawler

            CrawlerResult result;

            using (var crawler = new Crawler(wordLimit, excludedWords))
            {
                // Main is synchronous, so block until the crawl finishes.
                // GetAwaiter().GetResult() rethrows the original exception instead of
                // wrapping it in an AggregateException the way Wait() does.
                result = Task.Run(() => crawler.CrawlAsync()).GetAwaiter().GetResult();
            }

            if (result.Error != null)
            {
                Log.Logger.Error($"Crawl failed with Error: {result.Error.Message}\n{result.Error.StackTrace}");
                return;
            }

            Log.Logger.Information($"\n***MOST FREQUENT*** \n{result.Words.ToPrettyString()}");
        }
Пример #2
0
        /// <summary>
        /// Crawls the specified web page collecting counts of each unique
        /// <see cref="string"/> word in the History section.
        /// </summary>
        ///
        /// <param name="rootNode">The <see cref="HtmlNode"/> root node to start crawl at.
        /// USED PRIMARILY FOR TESTING.  THERE IS NO NEED TO SET THIS VALUE EXPLICITLY
        /// OUTSIDE OF TEST ENVIRONMENT.</param>
        /// <param name="endName">The <see cref="string"/> name of the <see cref="HtmlNode"/>
        /// to terminate on.  USED PRIMARILY FOR TESTING.  THERE IS NO NEED TO SET THIS
        /// VALUE EXPLICITLY OUTSIDE OF TEST ENVIRONMENT.</param>
        ///
        /// <returns>A <see cref="Task{CrawlerResult}"/> representing the async operation where
        /// the result is the <see cref="CrawlerResult"/> of the crawler execution.  On failure
        /// the success code is set to -1 and the error is set to the exception describing the
        /// failure; no exception from the request or the word count is thrown to the caller.
        /// On success the words are set to the most frequent words.</returns>
        public async Task <CrawlerResult> CrawlAsync(HtmlNode rootNode = null, string endName = "h2")
        {
            CrawlerResult result = new CrawlerResult();
            string        html   = null;

            // execute async request to crawl base address
            //
            // NOTE(review): every failure is caught inside the delegate, so no exception
            // escapes to retryPolicy; presumably the policy therefore never retries.
            // Confirm against the policy definition before relying on retries here.

            await retryPolicy.ExecuteAsync(async() =>
            {
                // get the page content

                HttpResponseMessage response;

                try
                {
                    // empty relative URI; presumably resolved against the client's base address
                    response = await client.GetAsync("");
                }
                catch (Exception e)
                {
                    // The message keeps its original format (message + stack trace); the
                    // original exception is additionally preserved as InnerException.
                    result.SuccessCode = -1;
                    result.Error       = new Exception($"{e.Message}\n{e.StackTrace}", e);

                    return;
                }

                // dispose the response (and its content stream) on every path below
                using (response)
                {
                    if (!response.IsSuccessStatusCode)
                    {
                        result.SuccessCode = -1;
                        result.Error       = new Exception(response.ReasonPhrase);

                        return;
                    }

                    // read the string content

                    try
                    {
                        html = await response.Content.ReadAsStringAsync();
                    }
                    catch (Exception e)
                    {
                        result.SuccessCode = -1;
                        result.Error       = e;
                    }
                }
            });

            if (result.Error != null)
            {
                return(result);
            }

            // load the HTML document
            // and count the words

            var document = new HtmlDocument();

            document.LoadHtml(html);

            try
            {
                await CountWordsAsync(document, rootNode, endName);
            }
            catch (Exception e)
            {
                result.SuccessCode = -1;
                result.Error       = e;

                return(result);
            }

            result.Words = GetMostFrequentWords();

            return(result);
        }