static void Main(string[] args)
{
    Console.Title = "Quote Generator by Mike";
    string url = "https://www.goodreads.com/quotes";

    Console.WriteLine("Generate quotes by keyword (separate with spaces/commas): ");
    string keywords = Console.ReadLine();

    Console.WriteLine("\nEnter the page to query.\nTo get a range of pages, enter the start and end numbers of the range, separated by commas: ");
    string numOfPagesString = Console.ReadLine();

    PageAndUrl pageAndUrl = new PageAndUrl(url);
    UserInput.GetPages(pageAndUrl, numOfPagesString);
    UserInput.GetUrl(pageAndUrl, keywords);

    Scraper scraper = new Scraper();

    // Time the whole scrape; blocking with Wait() is acceptable in a console entry point.
    stopwatch.Start();
    scraper.PageScraperAsync(pageAndUrl).Wait();
    stopwatch.Stop();

    Console.WriteLine($"\nFinished. Time Elapsed: {stopwatch.Elapsed}");
    Console.ReadKey();
}
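Main uses a stopwatch that never appears in the snippet, so it is presumably a class-level field. A minimal sketch of the assumed declaration (only the field name is taken from the calls above):

using System.Diagnostics;

// Assumed Program-level field; Main calls stopwatch.Start()/Stop() on it.
static readonly Stopwatch stopwatch = new Stopwatch();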
public static void GetUrl(PageAndUrl pageAndUrl, string keywords)
{
    StringBuilder newUrl = new StringBuilder();
    newUrl.Append(pageAndUrl.url); // URL stem, e.g. https://www.goodreads.com/quotes

    if (!string.IsNullOrEmpty(keywords))
    {
        string[] keywordList = keywords.Split(delim, StringSplitOptions.RemoveEmptyEntries);

        // First keyword opens the tag query; any further keywords are joined with '+'.
        newUrl.Append($"/tag?id={keywordList[0]}");
        for (int i = 1; i < keywordList.Length; i++)
        {
            newUrl.Append($"+{keywordList[i]}");
        }

        newUrl.Append("&page=");
    }
    else
    {
        // No tag query string exists yet, so the page number must open one with '?'.
        // Appending "&page=" here, as before, would produce a malformed URL.
        newUrl.Append("?page=");
    }

    pageAndUrl.url = newUrl.ToString();
}
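The delim array is not shown; a plausible sketch, assuming the separators match the prompt in Main ("spaces/commas"):

// Assumed UserInput field: split keywords on spaces and commas.
static readonly char[] delim = { ' ', ',' };

With keywords "love, life" the method yields https://www.goodreads.com/quotes/tag?id=love+life&page=, ready for a page number to be appended.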
public async Task PageScraperAsync(PageAndUrl pageAndUrl)
{
    #region Asynchronously loads web pages
    if (pageAndUrl.pages.Count == 1)
    {
        int startPage;
        if (pageAndUrl.pages[0] == 1)
        {
            startPage = pageAndUrl.pages[0];
        }
        else
        {
            startPage = await MaxPossiblePageCheck(pageAndUrl.url, pageAndUrl.pages[0]);
        }

        string urlPage = pageAndUrl.url + $"{startPage}&utf8=✓";
        htmlDocsList.Add(web.LoadFromWebAsync(urlPage));
    }
    else
    {
        int startPage = pageAndUrl.pages[0];
        int endPage = await MaxPossiblePageCheck(pageAndUrl.url, pageAndUrl.pages[1]);
        for (int i = startPage; i <= endPage; i++)
        {
            // Build the URL and start the download immediately. The previous
            // Task.Run(() => ...) wrapper was redundant (LoadFromWebAsync already
            // returns a Task), and its lambda captured the loop variable 'i',
            // which a 'for' loop shares across iterations - the deferred tasks
            // could all see a later (or the final) value of 'i'.
            htmlDocsList.Add(web.LoadFromWebAsync(pageAndUrl.url + $"{i}&utf8=✓"));
        }
    }

    // All downloads are in flight; await them together.
    var allHtmlDocuments = await Task.WhenAll(htmlDocsList);
    Console.WriteLine("\nWebpages Downloaded.\n");
    #endregion

    #region Extracts quote and author html nodes from each page
    foreach (HtmlDocument document in allHtmlDocuments)
    {
        QuoteAndAuthorNodes.Add(GetQuoteAndAuthorNode(document));
    }
    QuoteHtmlData[] QuoteAuthorNodesArray = QuoteAndAuthorNodes.ToArray();
    Console.WriteLine("Nodes extracted.\n");
    #endregion

    #region Stores quotes & authors into quoteData
    foreach (QuoteHtmlData node in QuoteAuthorNodesArray)
    {
        GetQuotesAndAuthorsText(node);
    }
    Console.WriteLine("Quotes and authors parsed.\n");
    #endregion
}
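PageScraperAsync leans on several members the snippet never declares (web, htmlDocsList, QuoteAndAuthorNodes) plus helper methods that are not shown. A minimal sketch of the assumed Scraper state, using HtmlAgilityPack's HtmlWeb; the field types are inferred from how they are used above, and QuoteHtmlData is taken to be the author's own type:

using System.Collections.Generic;
using System.Threading.Tasks;
using HtmlAgilityPack;

public class Scraper
{
    // Shared downloader; HtmlWeb.LoadFromWebAsync(url) returns Task<HtmlDocument>.
    private readonly HtmlWeb web = new HtmlWeb();

    // One pending download per requested page.
    private readonly List<Task<HtmlDocument>> htmlDocsList = new List<Task<HtmlDocument>>();

    // Quote/author node pairs extracted from each downloaded page.
    private readonly List<QuoteHtmlData> QuoteAndAuthorNodes = new List<QuoteHtmlData>();

    // MaxPossiblePageCheck, GetQuoteAndAuthorNode and GetQuotesAndAuthorsText
    // are referenced above but not included in this section.
}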
public static void GetPages(PageAndUrl pageAndUrl, string numOfPagesString)
{
    numOfPagesString = PageInputNullCheck(numOfPagesString);
    string[] pageRangeString = numOfPagesString.Split(stringSplit, StringSplitOptions.RemoveEmptyEntries);

    // Keep the first entry, then the first later entry that differs from it,
    // so "3, 3, 7" collapses to the range 3-7. (Starting at i = 1 skips the
    // old self-comparison of the first entry against itself.)
    List<string> pageRange = new List<string>() { pageRangeString[0] };
    for (int i = 1; i < pageRangeString.Length; i++)
    {
        if (!pageRange[0].Equals(pageRangeString[i]))
        {
            pageRange.Add(pageRangeString[i]);
            break;
        }
    }

    // Note: Convert.ToInt32 throws on non-numeric input. Pages below 1 are clamped to 1.
    List<int> pages = pageRange.ConvertAll(x => Convert.ToInt32(x));
    for (int i = 0; i < pages.Count; i++)
    {
        if (pages[i] <= 0)
        {
            pages[i] = 1;
        }
    }

    // Ensure the range runs low-to-high, then drop a duplicate if both ends match.
    if (pages.Count > 1 && pages[1] < pages[0])
    {
        SwapPages(pages);
    }
    pageAndUrl.pages = pages.Distinct().ToList();
}
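The stringSplit separators and the helpers PageInputNullCheck and SwapPages are not in this section. A sketch of plausible implementations: the names and signatures match the calls above, but the bodies are guesses, assuming blank input should default to page 1:

// Assumed UserInput members; bodies are illustrative, not the author's originals.
static readonly char[] stringSplit = { ' ', ',' };

// Default to page 1 when the user just presses Enter.
static string PageInputNullCheck(string input) =>
    string.IsNullOrWhiteSpace(input) ? "1" : input;

// Put the range endpoints in ascending order.
static void SwapPages(List<int> pages)
{
    int temp = pages[0];
    pages[0] = pages[1];
    pages[1] = temp;
}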