/// <summary>
/// Main method for the web crawler.
/// <para>
/// Argument 1 is the log level. Argument 2 is the word limit. Argument 3 is a
/// comma-separated list of words to be excluded by the crawler; the list must not
/// contain spaces unless it is wrapped in double quotes.
/// </para>
/// </summary>
///
/// <param name="args">The command line arguments described above.</param>
static void Main(string[] args)
{
    // parse the command line args, falling back to defaults
    int wordLimit = 10;
    List<string> excludedWords = new List<string>();
    string level = "Information";

    if (args.Length >= 1)
    {
        level = args[0].Trim();
    }
    if (args.Length >= 2)
    {
        var arg = args[1].Trim();
        if (Int32.TryParse(arg, out int parsedLimit))
        {
            wordLimit = parsedLimit;
        }
        else
        {
            Log.Logger.Debug($"Could not set word limit to desired configuration {arg}.");
        }
    }
    if (args.Length >= 3)
    {
        // normalize any space-separated input to the comma-separated form
        var arg = args[2].Trim().Replace(" ", ",");
        excludedWords = arg.TokenizeToList(",");
    }

    // configure the logger
    LogHelper.ConfigureLogger(level);
    Log.Logger.Information($"Starting Crawler with configurations: Log_Level={level}, Word_Limit={wordLimit}, Excluded_Words={excludedWords.ToPrettyString()}");

    // start the crawler and block on the async crawl, since Main is synchronous
    CrawlerResult result = new CrawlerResult();
    using (var crawler = new Crawler(wordLimit, excludedWords))
    {
        Task.Run(async () => { result = await crawler.CrawlAsync(); }).Wait();
    }

    if (result.Error != null)
    {
        Log.Logger.Error($"Crawl failed with Error: {result.Error.Message}\n{result.Error.StackTrace}");
        return;
    }

    Log.Logger.Information($"\n***MOST FREQUENT*** \n{result.Words.ToPrettyString()}");
}
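// NOTE: Main relies on two string helpers, TokenizeToList and ToPrettyString, that are
// defined elsewhere in the project and not shown in this section. The sketch below is
// only a minimal guess at how they might look; the class name, signatures, and behavior
// are assumptions, not confirmed by the source. Requires System,
// System.Collections.Generic, and System.Linq.
public static class StringExtensions
{
    // Splits the input on the given separator and returns the trimmed, non-empty
    // tokens as a list.
    public static List<string> TokenizeToList(this string input, string separator)
    {
        return input
            .Split(new[] { separator }, StringSplitOptions.RemoveEmptyEntries)
            .Select(token => token.Trim())
            .Where(token => token.Length > 0)
            .ToList();
    }

    // Formats a sequence as a bracketed, comma-separated string for logging,
    // e.g. [alpha, beta, gamma].
    public static string ToPrettyString<T>(this IEnumerable<T> items)
    {
        return $"[{string.Join(", ", items)}]";
    }
}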
/// <summary>
/// Crawls the specified web page, collecting counts of each unique
/// <see cref="string"/> word in the History section.
/// </summary>
///
/// <param name="rootNode">The <see cref="HtmlNode"/> root node to start the crawl at.
/// Used primarily for testing; there is no need to set this value explicitly outside
/// of a test environment.</param>
/// <param name="endName">The <see cref="string"/> name of the <see cref="HtmlNode"/>
/// to terminate on. Used primarily for testing; there is no need to set this value
/// explicitly outside of a test environment.</param>
///
/// <returns>A <see cref="Task{CrawlerResult}"/> representing the async operation where
/// the result is the <see cref="CrawlerResult"/> of the crawler execution. A success
/// code of 0 indicates success; a negative success code indicates a failure.</returns>
public async Task<CrawlerResult> CrawlAsync(HtmlNode rootNode = null, string endName = "h2")
{
    CrawlerResult result = new CrawlerResult();
    string html = null;

    // execute the request to the base address under the retry policy
    await retryPolicy.ExecuteAsync(async () =>
    {
        // get the page content
        HttpResponseMessage response = null;
        try
        {
            response = await client.GetAsync("");
        }
        catch (Exception e)
        {
            result.SuccessCode = -1;
            result.Error = new Exception($"{e.Message}\n{e.StackTrace}");
            return;
        }

        if (!response.IsSuccessStatusCode)
        {
            result.SuccessCode = -1;
            result.Error = new Exception(response.ReasonPhrase);
            return;
        }

        // read the string content
        try
        {
            html = await response.Content.ReadAsStringAsync();
        }
        catch (Exception e)
        {
            result.SuccessCode = -1;
            result.Error = e;
        }
    });

    if (result.Error != null)
    {
        return result;
    }

    // load the HTML document and count the words
    var document = new HtmlDocument();
    document.LoadHtml(html);
    try
    {
        await CountWordsAsync(document, rootNode, endName);
    }
    catch (Exception e)
    {
        result.SuccessCode = -1;
        result.Error = e;
    }

    if (result.Error != null)
    {
        return result;
    }

    result.Words = GetMostFrequentWords();
    return result;
}
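// NOTE: CrawlAsync wraps the HTTP call in a retryPolicy field whose construction is not
// shown in this section. The sketch below is one way such a policy could be configured
// with Polly; the retry count, backoff, and handled exception types are assumptions, not
// taken from the source. Requires the Polly NuGet package (using Polly;).
private static readonly IAsyncPolicy retryPolicy = Policy
    .Handle<HttpRequestException>()  // transient network failures
    .Or<TaskCanceledException>()     // request timeouts surfaced as cancellations
    .WaitAndRetryAsync(
        3,                                                      // retry up to three times
        attempt => TimeSpan.FromSeconds(Math.Pow(2, attempt)),  // exponential backoff
        (exception, delay, attempt, context) =>
            Log.Logger.Warning(
                $"Request attempt {attempt} failed: {exception.Message}. Retrying in {delay.TotalSeconds}s."));
// Design note: because CrawlAsync catches exceptions inside the delegate passed to
// ExecuteAsync and records them on the CrawlerResult, only failures that escape the
// delegate would actually trigger these retries.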