/// <summary>
/// Collects the quotes listed in the "WikiModule_Quotations" section of a book page.
/// </summary>
/// <param name="url">Page to download when <paramref name="srcDoc"/> is null.</param>
/// <param name="token">Not used by this implementation.</param>
/// <param name="srcDoc">Already-loaded page to read from; downloaded from <paramref name="url"/> when null.</param>
/// <param name="progress">Not used by this implementation.</param>
/// <returns>One clip per quote found (Likes is always 0); an empty list when there are none.</returns>
public override async Task<List<NotableClip>> GetNotableClips(string url, CancellationToken token, HtmlDocument srcDoc = null, IProgressBar progress = null)
{
    if (srcDoc == null)
    {
        srcDoc = new HtmlDocument();
        srcDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(url));
    }

    List<NotableClip> result = new List<NotableClip>();

    // Query the document this call was given (or just downloaded). Reading the sourceHtmlDoc field
    // here ignored srcDoc entirely and threw when the field had not been populated yet.
    HtmlNodeCollection quoteNodes = srcDoc.DocumentNode.SelectNodes("//div[@id='WikiModule_Quotations']/div/ul[@class='li_6']/li");
    if (quoteNodes == null)
    {
        return result;
    }

    foreach (HtmlNode quoteNode in quoteNodes)
    {
        HtmlNode node = quoteNode.SelectSingleNode(".//blockquote");
        if (node == null)
        {
            continue;
        }

        string quote = node.InnerText;
        // Remove quotes (sometimes people put unnecessary quotes in the quote as well)
        quote = Regex.Replace(quote, "^(“){1,2}", "");
        quote = Regex.Replace(quote, "(”){1,2}$", "");
        result.Add(new NotableClip { Text = quote, Likes = 0 });
    }

    return result;
}
/// <summary>
/// Reads the page count from the book page's "details" block and derives an estimated reading time.
/// Modifies curBook (pagesInBook, readingHours, readingMinutes).
/// </summary>
/// <param name="curBook">Book whose dataUrl is downloaded when no page is cached, and which receives the results.</param>
/// <returns>True when a page count was found and stored; otherwise false.</returns>
public override async Task<bool> GetPageCount(BookInfo curBook)
{
    // Reading-time estimate used by the original code: minutes of reading per page.
    const double minutesPerPage = 1.2890625;

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    HtmlNode pagesNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='details']");
    if (pagesNode == null)
    {
        return false;
    }

    Match match = Regex.Match(pagesNode.InnerText, @"((\d+)|(\d+,\d+)) pages");
    if (!match.Success)
    {
        return false;
    }

    // Parse once, accepting "1,234". The page text uses ',' as the group separator regardless of the
    // machine's culture, so the invariant culture is required; a second plain int.Parse would throw here.
    string pageText = match.Groups[1].Value;
    if (!int.TryParse(pageText, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out int pages))
    {
        return false;
    }

    TimeSpan span = TimeSpan.FromMinutes(pages * minutesPerPage);
    // TimeSpan.Hours is only the 0-23 component; long books exceed 24 hours, so use the total.
    int hours = (int)span.TotalHours;
    Logger.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)", hours, span.Minutes, pageText));
    curBook.pagesInBook = pages;
    curBook.readingHours = hours;
    curBook.readingMinutes = span.Minutes;
    return true;
}
/// <summary>
/// Reads "Page Count: N" from the "WikiModule_FirstEdition" block and derives an estimated reading time.
/// Modifies curBook (pagesInBook, readingHours, readingMinutes).
/// </summary>
/// <param name="curBook">Book whose dataUrl is downloaded when no page is cached, and which receives the results.</param>
/// <returns>True when a page count was found and stored; otherwise false.</returns>
public override async Task<bool> GetPageCount(BookInfo curBook)
{
    // Reading-time estimate used by the original code: minutes of reading per page.
    const double minutesPerPage = 1.2890625;

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    HtmlNode pageNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
    HtmlNode node1 = pageNode?.SelectSingleNode(".//div/div");
    if (node1 == null)
    {
        return false;
    }

    //Parse page count and multiply by average reading time
    Match match1 = Regex.Match(node1.InnerText, @"Page Count: ((\d+)|(\d+,\d+))");
    if (!match1.Success)
    {
        return false;
    }

    // Parse once, accepting "1,234". The page text uses ',' as the group separator regardless of the
    // machine's culture, so the invariant culture is required; a second plain int.Parse would throw here.
    string pageText = match1.Groups[1].Value;
    if (!int.TryParse(pageText, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out int pages))
    {
        return false;
    }

    TimeSpan span = TimeSpan.FromMinutes(pages * minutesPerPage);
    // TimeSpan.Hours is only the 0-23 component; long books exceed 24 hours, so use the total.
    int hours = (int)span.TotalHours;
    Logger.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)", hours, span.Minutes, pageText));
    curBook.pagesInBook = pages;
    curBook.readingHours = hours;
    curBook.readingMinutes = span.Minutes;
    return true;
}
/// <summary>
/// Scrape any notable quotes from Goodreads and grab ratings if missing from book info.
/// Modifies curBook (notableClips, amazonRating, numReviews).
/// </summary>
/// <param name="curBook">Book to fill in; its dataUrl is downloaded when no page is cached.</param>
/// <param name="token">Passed through to the notable-clips lookup.</param>
/// <param name="progress">Optional progress reporter passed through to the notable-clips lookup.</param>
public override async Task GetExtras(BookInfo curBook, CancellationToken token, IProgressBar progress = null)
{
    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    if (curBook.notableClips == null)
    {
        curBook.notableClips = await GetNotableClips("", token, sourceHtmlDoc, progress).ConfigureAwait(false);
    }

    //Add rating and reviews count if missing from Amazon book info
    HtmlNode metaNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='bookMeta']");
    if (metaNode == null || curBook.amazonRating != 0)
    {
        return;
    }

    // NOTE(review): this XPath starts with "//", so it searches the whole document rather than only metaNode.
    HtmlNode goodreadsRating = metaNode.SelectSingleNode("//span[@class='value rating']");
    if (goodreadsRating != null)
    {
        // The page always writes the rating with '.' as the decimal separator; parsing with the current
        // culture turns "4.25" into 425 on comma-decimal systems. TryParse keeps bad text from throwing.
        if (float.TryParse(goodreadsRating.InnerText,
                           System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                           System.Globalization.CultureInfo.InvariantCulture,
                           out float rating))
        {
            curBook.amazonRating = rating;
        }
    }

    HtmlNode passagesNode = metaNode.SelectSingleNode(".//a[@class='actionLinkLite votes' and @href='#other_reviews']")
                            ?? metaNode.SelectSingleNode(".//span[@class='count value-title']");
    if (passagesNode == null)
    {
        return;
    }

    Match match = Regex.Match(passagesNode.InnerText, @"(\d+|\d{1,3}([,\.]\d{3})*)(?=\s)");
    if (match.Success)
    {
        // Strip either style of group separator before parsing the plain digits.
        string digits = match.Value.Replace(",", "").Replace(".", "");
        if (int.TryParse(digits, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out int reviews))
        {
            curBook.numReviews = reviews;
        }
    }
}
/// <summary>
/// Runs a Goodreads search for "author title" and parses the result page.
/// </summary>
/// <param name="author">Author name; passed through Functions.FixAuthor and URL-escaped.</param>
/// <param name="title">Book title; URL-escaped.</param>
/// <returns>The parsed search results, or null when the page text contains "No results".</returns>
public override async Task<List<BookInfo>> SearchBook(string author, string title)
{
    string escapedTitle = Uri.EscapeDataString(title);
    string escapedAuthor = Uri.EscapeDataString(Functions.FixAuthor(author));
    string searchUrl = String.Format(@"https://www.goodreads.com/search?q={0}%20{1}", escapedAuthor, escapedTitle);

    string html = await HttpDownloader.GetPageHtmlAsync(searchUrl);
    var resultsDoc = new HtmlDocument();
    resultsDoc.LoadHtml(html);

    if (resultsDoc.DocumentNode.InnerText.Contains("No results"))
    {
        return null;
    }

    return ParseSearchResults(resultsDoc);
}
/// <summary>
/// Builds the term list from the "Characters" row of a Goodreads book page, downloading each
/// character's own page (up to MaxConcurrentRequests at a time) to fill in its details.
/// </summary>
/// <param name="dataUrl">Book page URL; downloaded when no page is cached, and used as the base for character links.</param>
/// <param name="progress">Optional progress reporter; advanced once per character processed.</param>
/// <param name="token">Cancellation token handed to the parallel loop.</param>
/// <returns>The terms that could be retrieved; empty when the page lists no characters.</returns>
public override async Task<List<XRay.Term>> GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
{
    if (sourceHtmlDoc == null)
    {
        Logger.Log("Downloading Goodreads page...");
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
    }

    var charNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/a");
    if (charNodes == null)
    {
        return new List<XRay.Term>();
    }

    // Check if ...more link exists on Goodreads page
    var moreCharNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/span[@class='toggleContent']/a");
    var allChars = moreCharNodes == null ? charNodes : charNodes.Concat(moreCharNodes);
    var termCount = moreCharNodes == null ? charNodes.Count : charNodes.Count + moreCharNodes.Count;
    Logger.Log($"Gathering term information from Goodreads... ({termCount})");
    progress?.Set(0, termCount);
    if (termCount > 20)
    {
        Logger.Log("More than 20 characters found. Consider using the 'download to XML' option if you need to build repeatedly.");
    }

    var terms = new ConcurrentBag<XRay.Term>();
    await allChars.ParallelForEachAsync(async charNode =>
    {
        string href = charNode.GetAttributeValue("href", "");
        try
        {
            terms.AddNotNull(await GetTerm(dataUrl, href).ConfigureAwait(false));
        }
        catch (Exception ex)
        {
            // One bad character page must not abort the whole run, but every failure is reported:
            // previously anything other than a 404 made the term vanish without a trace.
            Logger.Log("Error getting page for character. URL: " + "https://www.goodreads.com" + href
                       + "\r\nMessage: " + ex.Message + "\r\n" + ex.StackTrace);
        }
        finally
        {
            // Count failed characters too so the progress bar can reach termCount.
            progress?.Add(1);
        }
    }, MaxConcurrentRequests, token);

    return terms.ToList();
}
/// <summary>
/// Search Goodreads for a possible Kindle edition of a book and return its ASIN.
/// </summary>
/// <param name="id">Goodreads book id used to build the book page URL.</param>
/// <param name="title">Book title; only used in the error log message.</param>
/// <returns>The first ASIN-shaped token found in the Kindle editions list, or "" when none is found or an error occurs.</returns>
private async Task<string> SearchBookASIN(string id, string title)
{
    string goodreadsBookUrl = String.Format("https://www.goodreads.com/book/show/{0}", id);
    try
    {
        HtmlDocument bookHtmlDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
        bookHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(goodreadsBookUrl));

        // A book without an "other editions" link is an expected outcome, not an error: handle it
        // explicitly instead of letting a NullReferenceException fall into the catch below.
        HtmlNode link = bookHtmlDoc.DocumentNode.SelectSingleNode("//div[@class='otherEditionsActions']/a");
        if (link == null)
        {
            return "";
        }

        Match match = Regex.Match(link.GetAttributeValue("href", ""), @"editions/([0-9]*)-");
        // The pattern allows an empty id; without digits there is no editions page to query.
        if (!match.Success || match.Groups[1].Length == 0)
        {
            return "";
        }

        string kindleEditionsUrl = String.Format("https://www.goodreads.com/work/editions/{0}?utf8=%E2%9C%93&sort=num_ratings&filter_by_format=Kindle+Edition", match.Groups[1].Value);
        bookHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(kindleEditionsUrl));
        HtmlNodeCollection bookNodes = bookHtmlDoc.DocumentNode.SelectNodes("//div[@class='elementList clearFix']");
        if (bookNodes == null)
        {
            return "";
        }

        foreach (HtmlNode book in bookNodes)
        {
            match = Regex.Match(book.InnerHtml, "(B[A-Z0-9]{9})");
            if (match.Success)
            {
                return match.Value;
            }
        }

        return "";
    }
    catch (Exception ex)
    {
        Logger.Log(String.Format("An error occurred while searching for {0}s ASIN.\r\n", title) + ex.Message + "\r\n" + ex.StackTrace);
        return "";
    }
}
// TODO: All calls to Amazon should check for the captcha page (or ideally avoid it somehow)
/// <summary>
/// Searches Amazon for a Kindle edition of the given book, looking only at the first two results.
/// </summary>
/// <param name="title">Book title; anything from the first " (" onward is dropped before searching.</param>
/// <param name="author">Author name, appended to the search keywords and stored on the result.</param>
/// <param name="TLD">Amazon top-level domain (the part after "amazon.").</param>
/// <returns>A BookInfo with ASIN and amazonUrl set, or null when no usable Kindle edition was found.</returns>
public static async Task<BookInfo> SearchBook(string title, string author, string TLD)
{
    int parenIndex = title.IndexOf(" (", StringComparison.Ordinal);
    if (parenIndex >= 0)
    {
        title = title.Substring(0, parenIndex);
    }

    //Search "all" Amazon
    string searchUrl = String.Format(@"https://www.amazon.{0}/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={1}", TLD, Uri.EscapeDataString(title + " " + author));
    HtmlDocument searchDoc = new HtmlDocument();
    searchDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(searchUrl));

    HtmlNode node = searchDoc.DocumentNode.SelectSingleNode("//li[@id='result_0']");
    HtmlNode nodeASIN = node?.SelectSingleNode(".//a[@title='Kindle Edition']");
    if (nodeASIN == null)
    {
        node = searchDoc.DocumentNode.SelectSingleNode("//li[@id='result_1']");
        nodeASIN = node?.SelectSingleNode(".//a[@title='Kindle Edition']");
    }

    //At least attempt to verify it might be the same book?
    if (node == null || nodeASIN == null || node.InnerText.IndexOf(title, StringComparison.OrdinalIgnoreCase) < 0)
    {
        return null;
    }

    // Without an ASIN there is nothing useful to return; an unchecked failed match used to yield
    // a BookInfo with an empty ASIN and an empty URL.
    Match foundASIN = Regex.Match(nodeASIN.OuterHtml, "(B[A-Z0-9]{9})");
    if (!foundASIN.Success)
    {
        return null;
    }

    node = node.SelectSingleNode(".//div/div/div/div[@class='a-fixed-left-grid-col a-col-right']/div/a");
    if (node == null)
    {
        return null;
    }

    BookInfo result = new BookInfo(node.InnerText, author, foundASIN.Value);

    // Grab the true link for good measure: cut the href right after the ASIN. The ASIN was matched in
    // the element's outer HTML, so it may be absent from the href itself; keep the href whole then
    // rather than slicing at index -1 + length.
    string trimUrl = nodeASIN.GetAttributeValue("href", "");
    int asinIndex = trimUrl.IndexOf(foundASIN.Value, StringComparison.Ordinal);
    if (asinIndex >= 0)
    {
        trimUrl = trimUrl.Substring(0, asinIndex + foundASIN.Length);
    }

    result.amazonUrl = trimUrl;
    return result;
}
// Are there actually any goodreads pages that aren't at goodreads.com for other languages??
/// <summary>
/// Downloads a character page and builds a "character" term from its heading, aliases and description.
/// </summary>
/// <param name="baseUrl">Absolute URL whose scheme and host are used to resolve <paramref name="relativeUrl"/>.</param>
/// <param name="relativeUrl">Link to the character page, relative to the site root.</param>
/// <returns>
/// The populated term, or null when the page has no main content block or heading to name the term from.
/// </returns>
private async Task<XRay.Term> GetTerm(string baseUrl, string relativeUrl)
{
    XRay.Term result = new XRay.Term("character");
    Uri siteRoot = new Uri(new Uri(baseUrl).GetLeftPart(UriPartial.Authority));
    Uri termUri = new Uri(siteRoot, relativeUrl);
    result.DescSrc = "Goodreads";
    result.DescUrl = termUri.ToString();

    HtmlDocument charDoc = new HtmlDocument();
    charDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(termUri.ToString()));

    // The class attribute is matched exactly, so the variant with a trailing space is tried as well.
    HtmlNode mainNode = charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat']")
                        ?? charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat ']");
    HtmlNode nameNode = mainNode?.SelectSingleNode("./h1");
    if (nameNode == null)
    {
        // Unexpected page layout: a term without a name is useless, so report "no term" instead of
        // failing with a NullReferenceException.
        return null;
    }

    result.TermName = nameNode.InnerText;

    // Both XPaths below start with "//" and therefore search the whole document.
    HtmlNode contentBox = mainNode.SelectSingleNode("//div[@class='grey500BoxContent']");
    HtmlNodeCollection infoBoxes = contentBox?.SelectNodes("//div[@class='floatingBox']");
    if (infoBoxes == null)
    {
        return result;
    }

    foreach (HtmlNode infoBox in infoBoxes)
    {
        string text = infoBox.InnerText.Replace("[close]", "").Trim();
        if (infoBox.Id.Contains("_aliases")) // If present, add any aliases found
        {
            result.Aliases.AddRange(text.Split(new[] { ", " }, StringSplitOptions.RemoveEmptyEntries));
        }
        else
        {
            result.Desc = text;
        }
    }

    return result;
}
/// <summary>
/// Gather the list of quotes &amp; number of times they've been liked -- close enough to "x paragraphs have been highlighted y times" from Amazon
/// </summary>
/// <param name="url">Book page to download when <paramref name="srcDoc"/> is null.</param>
/// <param name="token">Cancellation token handed to the parallel page downloads.</param>
/// <param name="srcDoc">Already-loaded book page; downloaded from <paramref name="url"/> when null.</param>
/// <param name="progress">Optional progress reporter; advanced once per quote page.</param>
/// <returns>
/// All quotes that have both quoted text and a like count, or null when the book page has no
/// "Quotes from" link or the first quote page has no pagination controls.
/// </returns>
public override async Task<List<NotableClip>> GetNotableClips(string url, CancellationToken token, HtmlDocument srcDoc = null, IProgressBar progress = null)
{
    if (srcDoc == null)
    {
        srcDoc = new HtmlDocument();
        srcDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(url));
    }

    HtmlNode quoteNode = srcDoc.DocumentNode.SelectSingleNode("//div[@class='h2Container gradientHeaderContainer']/h2/a[starts-with(.,'Quotes from')]");
    if (quoteNode == null)
    {
        return null;
    }

    string quoteURL = $"https://www.goodreads.com{quoteNode.GetAttributeValue("href", "")}?page={{0}}";
    progress?.Set(0, 1);

    var quoteBag = new ConcurrentBag<IEnumerable<NotableClip>>();
    var initialPage = new HtmlDocument();
    initialPage.LoadHtml(await HttpDownloader.GetPageHtmlAsync(string.Format(quoteURL, 1)));

    // check how many pages there are (find previous page button, get parent div, take all children of that, 2nd last one should be the max page count
    HtmlNode maxPageNode = initialPage.DocumentNode.SelectSingleNode("//span[contains(@class,'previous_page')]/parent::div/*[last()-1]");
    if (maxPageNode == null)
    {
        return null;
    }

    if (!int.TryParse(maxPageNode.InnerHtml, out var maxPages))
    {
        maxPages = 1;
    }

    // Extracts every quote on one page; entries lacking the quoted text or the like count are dropped.
    IEnumerable<NotableClip> ParseQuotePage(HtmlDocument quoteDoc)
    {
        HtmlNodeCollection tempNodes = quoteDoc.DocumentNode.SelectNodes("//div[@class='quotes']/div[@class='quote']");
        return tempNodes?.Select(node =>
        {
            var quoteMatch = Regex.Match(node.InnerText, "“(.*?)”", RegexOptions.Compiled);
            var likesMatch = Regex.Match(node.SelectSingleNode(".//div[@class='right']/a")?.InnerText ?? "", @"(\d+) likes", RegexOptions.Compiled);
            if (!quoteMatch.Success || !likesMatch.Success)
            {
                return null;
            }

            return new NotableClip { Text = quoteMatch.Groups[1].Value, Likes = int.Parse(likesMatch.Groups[1].Value) };
        }).Where(quote => quote != null);
    }

    quoteBag.Add(ParseQuotePage(initialPage));
    progress?.Set(1, maxPages);

    // Page 1 is already parsed, so pages 2..maxPages remain. Enumerable.Range takes (start, count),
    // not (start, end): passing maxPages as the count requested one page beyond the last.
    int remainingPages = Math.Max(0, maxPages - 1);
    await Enumerable.Range(2, remainingPages).ParallelForEachAsync(async page =>
    {
        var quotePage = new HtmlDocument();
        quotePage.LoadHtml(await HttpDownloader.GetPageHtmlAsync(string.Format(quoteURL, page)));
        quoteBag.Add(ParseQuotePage(quotePage));
        progress?.Add(1);
    }, MaxConcurrentRequests, token);

    return quoteBag.Where(quotes => quotes != null && quotes.Any()).SelectMany(quotes => quotes).ToList();
}
/// <summary>
/// Finds the next book in the series (returned) and the previous one (stored on curBook.previousInSeries),
/// based on the series titles parsed from the Shelfari page.
/// </summary>
/// <param name="curBook">Current book; its dataUrl is downloaded when no page is cached.</param>
/// <param name="authorProfile">Author profile whose otherBooks are searched first for the next title.</param>
/// <param name="TLD">Amazon top-level domain used for the Amazon search fallback.</param>
/// <returns>The next book in the series, or null when it could not be determined.</returns>
public override async Task<BookInfo> GetNextInSeries(BookInfo curBook, AuthorProfile authorProfile, string TLD)
{
    BookInfo nextBook = null;
    if (curBook.dataUrl == "")
    {
        return null;
    }

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    // Get title of next book
    Dictionary<string, string> seriesInfo = await GetNextInSeriesTitle(curBook).ConfigureAwait(false);
    if (seriesInfo == null)
    {
        return null;
    }

    if (seriesInfo.TryGetValue("Next", out var title))
    {
        // Search author's other books for the book (assumes next in series was written by the same author...)
        // Returns the first one found, though there should probably not be more than 1 of the same name anyway
        string nextTitle = title;
        nextBook = authorProfile.otherBooks.FirstOrDefault(bk => bk.title == nextTitle);
        if (nextBook == null)
        {
            // Attempt to search Amazon for the book instead
            nextBook = await Amazon.SearchBook(nextTitle, curBook.author, TLD);
            if (nextBook != null)
            {
                await nextBook.GetAmazonInfo(nextBook.amazonUrl); //fill in desc, imageurl, and ratings
            }
        }

        // Try to fill in desc, imageurl, and ratings using Shelfari Kindle edition link instead
        if (nextBook == null && seriesInfo.TryGetValue("NextURL", out var nextUrl))
        {
            nextBook = await GetBookFromKindleEditionsAsync(nextUrl, nextTitle, curBook.author);
        }

        if (nextBook == null)
        {
            Logger.Log("Book was found to be part of a series, but an error occured finding the next book.\r\n"
                       + "Please report this book and the Shelfari URL and output log to improve parsing.");
        }
    }
    else if (curBook.seriesPosition != curBook.totalInSeries.ToString())
    {
        Logger.Log("An error occured finding the next book in series, the book may not be part of a series, or it is the latest release.");
    }

    // Only look the previous book up when it is not already known.
    if (seriesInfo.TryGetValue("Previous", out title) && curBook.previousInSeries == null)
    {
        // Attempt to search Amazon for the book
        curBook.previousInSeries = await Amazon.SearchBook(title, curBook.author, TLD);
        if (curBook.previousInSeries != null)
        {
            await curBook.previousInSeries.GetAmazonInfo(curBook.previousInSeries.amazonUrl); //fill in desc, imageurl, and ratings
        }

        // Try to fill in desc, imageurl, and ratings using Shelfari Kindle edition link instead
        if (curBook.previousInSeries == null && seriesInfo.TryGetValue("PreviousURL", out var previousUrl))
        {
            curBook.previousInSeries = await GetBookFromKindleEditionsAsync(previousUrl, title, curBook.author);
        }

        // Report the failure when every lookup came back empty. This message used to sit in the branch
        // taken when the previous book was ALREADY known, and it named the next book.
        if (curBook.previousInSeries == null)
        {
            Logger.Log("Book was found to be part of a series, but an error occured finding the previous book.\r\n"
                       + "Please report this book and the Shelfari URL and output log to improve parsing.");
        }
    }

    return nextBook;
}

// Downloads a Kindle-editions page, takes the first quoted ASIN found in it and loads that book's Amazon info; null when no ASIN is present.
private static async Task<BookInfo> GetBookFromKindleEditionsAsync(string editionsUrl, string title, string author)
{
    HtmlDocument bookDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
    bookDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(editionsUrl));
    Match match = Regex.Match(bookDoc.DocumentNode.InnerHtml, "('B[A-Z0-9]{9}')");
    if (!match.Success)
    {
        return null;
    }

    string cleanASIN = match.Value.Replace("'", String.Empty);
    BookInfo found = new BookInfo(title, author, cleanASIN);
    await found.GetAmazonInfo("http://www.amazon.com/dp/" + cleanASIN);
    return found;
}
/// <summary>
/// Builds the term list from the Characters, Organizations, Settings and Glossary wiki sections of the page.
/// Each list item is read as "name: description"; items without a colon become name-only terms.
/// </summary>
/// <param name="dataUrl">Book page URL; downloaded when no page is cached, and used as the fallback term URL.</param>
/// <param name="progress">Not used by this implementation.</param>
/// <param name="token">Not used by this implementation.</param>
/// <returns>The terms found, keeping the first occurrence of each term name.</returns>
public override async Task<List<XRay.Term>> GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
{
    Logger.Log("Downloading Shelfari page...");
    List<XRay.Term> terms = new List<XRay.Term>();
    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
    }

    //Constants for wiki processing
    Dictionary<string, string> sections = new Dictionary<string, string>
    {
        { "WikiModule_Characters", "character" },
        { "WikiModule_Organizations", "topic" },
        { "WikiModule_Settings", "topic" },
        { "WikiModule_Glossary", "topic" }
    };

    // Names already added, for constant-time duplicate detection instead of rescanning the list.
    HashSet<string> seenNames = new HashSet<string>();

    foreach (KeyValuePair<string, string> section in sections)
    {
        string header = section.Key;
        HtmlNodeCollection characterNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@id='" + header + "']//ul[@class='li_6']/li");
        if (characterNodes == null)
        {
            continue; //Skip section if not found on page
        }

        foreach (HtmlNode li in characterNodes)
        {
            string text = li.InnerText;
            XRay.Term newTerm = new XRay.Term(section.Value); //Create term as either character/topic

            int colonIndex = text.IndexOf(":", StringComparison.Ordinal);
            if (colonIndex >= 0)
            {
                newTerm.TermName = text.Substring(0, colonIndex);
                // Decode the ampersand entity; the previous Replace("&", "&") changed nothing.
                newTerm.Desc = text.Substring(colonIndex + 1).Replace("&amp;", "&").Trim();
            }
            else
            {
                newTerm.TermName = text;
            }

            newTerm.DescSrc = "shelfari";

            //Use either the associated shelfari URL of the term or if none exists, use the book's url
            // An item that starts with <a href="..."> carries its own link: the value sits between the
            // quote at index 8 and the next quote. Malformed or empty links fall back to the book URL.
            string html = li.InnerHtml;
            string termUrl = dataUrl;
            if (html.Length > 9 && html.StartsWith("<a href", StringComparison.Ordinal))
            {
                int closingQuote = html.IndexOf("\"", 9, StringComparison.Ordinal);
                if (closingQuote > 9)
                {
                    termUrl = html.Substring(9, closingQuote - 9);
                }
            }

            newTerm.DescUrl = termUrl;

            if (header == "WikiModule_Glossary")
            {
                newTerm.MatchCase = false; //Default glossary terms to be case insensitive when searching through book
            }

            if (seenNames.Add(newTerm.TermName))
            {
                terms.Add(newTerm);
            }
            else
            {
                Logger.Log("Duplicate term \"" + newTerm.TermName + "\" found. Ignoring this duplicate.");
            }
        }
    }

    return terms;
}
/// <summary>
/// Search Shelfari page for possible series info, returning the next title in the series without downloading any other pages.
/// Modifies curBook (seriesPosition, totalInSeries, seriesName) when a standard series entry is found.
/// </summary>
/// <param name="curBook">Current book; its dataUrl is downloaded when no page is cached.</param>
/// <returns>
/// Null when curBook.dataUrl is empty; otherwise a dictionary that may contain "Previous"/"PreviousURL"
/// and "Next"/"NextURL" (titles and Kindle-edition links), and is empty when nothing was found.
/// </returns>
private async Task<Dictionary<string, string>> GetNextInSeriesTitle(BookInfo curBook)
{
    if (curBook.dataUrl == "")
    {
        return null;
    }

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    Dictionary<string, string> results = new Dictionary<string, string>(2);

    // The series lookup is skipped entirely when the page has no first-edition block.
    HtmlNode pageNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
    if (pageNode?.SelectSingleNode(".//div/div") == null)
    {
        return results;
    }

    //Check if book series is available and displayed in Series & Lists on Shelfari page.
    HtmlNode seriesNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_Series']/div");
    // SelectNodes returns null (not an empty collection) when nothing matches.
    HtmlNodeCollection seriesTypes = seriesNode?.SelectNodes(".//div");
    if (seriesTypes == null)
    {
        return results;
    }

    //If multiple Series found, find and use standard series.
    foreach (HtmlNode seriesType in seriesTypes)
    {
        if (!seriesType.InnerText.Contains("(standard series)", StringComparison.OrdinalIgnoreCase)
            || seriesType.InnerText.Contains("(Reading Order)", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        Match match = Regex.Match(seriesType.InnerText, @"This is book (\d+) of (\d+) in (.+)\.");
        if (!match.Success || match.Groups.Count != 4)
        {
            continue;
        }

        Logger.Log("About the series: " + seriesType.InnerText.Replace(". (standard series)", ""));
        curBook.seriesPosition = match.Groups[1].Value;
        curBook.totalInSeries = int.Parse(match.Groups[2].Value);
        curBook.seriesName = match.Groups[3].Value;

        HtmlNode seriesInfo = seriesNode.SelectSingleNode(".//p");
        // NOTE(review): both the previous and the next link are taken from the FIRST <a> child of the
        // paragraph, as in the original code; confirm against a page that lists both books.
        string editionsUrl = seriesInfo?.ChildNodes["a"]?.GetAttributeValue("href", "");

        //Parse preceding book
        if (seriesInfo != null && seriesInfo.InnerText.Contains("Preceded by ", StringComparison.OrdinalIgnoreCase))
        {
            // "Preceded by X, followed by Y." when a next book is listed, otherwise "Preceded by X."
            match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*),", RegexOptions.IgnoreCase);
            if (!match.Success)
            {
                match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*)\.", RegexOptions.IgnoreCase);
            }

            if (match.Success)
            {
                results["Previous"] = match.Groups[1].Value;
                Logger.Log("Preceded by: " + match.Groups[1].Value);
            }

            //Grab Shelfari Kindle edition link for this book
            if (editionsUrl != null)
            {
                results["PreviousURL"] = editionsUrl + "/editions?binding=Kindle";
            }
        }

        // Check if book is the last in the series. seriesPosition is a string and totalInSeries an int,
        // so compare their text: string.Equals(int) is always false and never detected the last book.
        bool isLastInSeries = curBook.seriesPosition == curBook.totalInSeries.ToString();
        if (!isLastInSeries
            && seriesInfo != null
            && seriesInfo.InnerText.Contains("followed by ", StringComparison.OrdinalIgnoreCase))
        {
            //Parse following book
            match = Regex.Match(seriesInfo.InnerText, @"followed by (.*)\.", RegexOptions.IgnoreCase);
            if (match.Success && editionsUrl != null)
            {
                Logger.Log("Followed by: " + match.Groups[1].Value);
                //Grab Shelfari Kindle edition link for this book
                results["NextURL"] = editionsUrl + "/editions?binding=Kindle";
                results["Next"] = match.Groups[1].Value;
                return results;
            }
        }

        break;
    }

    return results;
}
/// <summary>
/// Requests the "next" endpoint for the given ASIN and deserializes the JSON response.
/// </summary>
/// <param name="asin">ASIN appended to the endpoint URL.</param>
/// <returns>The response deserialized as a NextBookResult.</returns>
public static async Task<NextBookResult> DownloadNextInSeries(string asin)
{
    string endpoint = "https://www.revensoftware.com/amazon/next/" + asin;
    string json = await HttpDownloader.GetPageHtmlAsync(endpoint);
    NextBookResult parsed = JsonConvert.DeserializeObject<NextBookResult>(json);
    return parsed;
}
/// <summary>
/// Requests the "sa" endpoint for the given ASIN.
/// </summary>
/// <param name="asin">ASIN appended to the endpoint URL.</param>
/// <returns>The download task, handed back without awaiting; its result is the raw response text.</returns>
public static Task<string> DownloadStartActions(string asin)
{
    string endpoint = "https://www.revensoftware.com/amazon/sa/" + asin;
    return HttpDownloader.GetPageHtmlAsync(endpoint);
}
/// <summary>
/// Search Goodreads for possible series info, returning the previous and next titles in the series.
/// Modifies curBook (seriesName, seriesPosition, totalInSeries, previousInSeries, nextInSeries).
/// </summary>
/// <param name="curBook">Current book; its dataUrl is downloaded when no page is cached.</param>
/// <returns>
/// A dictionary that may contain "Previous" and/or "Next"; empty when the book has no series link or
/// its position in the series cannot be read.
/// </returns>
private async Task<Dictionary<string, BookInfo>> GetNextInSeriesTitle(BookInfo curBook)
{
    Dictionary<string, BookInfo> results = new Dictionary<string, BookInfo>(2);
    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    //Search Goodreads for series info
    HtmlNode metaNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='bookMeta']");
    HtmlNode seriesNode = metaNode?.SelectSingleNode("//h1[@id='bookTitle']/a");
    if (seriesNode == null)
    {
        return results;
    }

    var match = Regex.Match(seriesNode.OuterHtml, @"/series/([0-9]*)");
    if (!match.Success)
    {
        return results;
    }

    string goodreadsSeriesUrl = String.Format(@"https://www.goodreads.com/series/{0}", match.Groups[1].Value);

    match = Regex.Match(seriesNode.InnerText, @"\((.*) #?([0-9]*([.,][0-9])?)\)");
    if (!match.Success)
    {
        return results;
    }

    Logger.Log($"Series Goodreads Page URL: {goodreadsSeriesUrl}");
    curBook.seriesName = match.Groups[1].Value.Trim();
    curBook.seriesPosition = match.Groups[2].Value;

    // The pattern accepts both "1.5" and "1,5" (and an empty position). Under the invariant culture a
    // comma is a group separator, so "1,5" used to parse as 15; normalise to '.' and bail out cleanly
    // on anything that is not a number instead of throwing.
    string normalizedPosition = curBook.seriesPosition.Replace(',', '.');
    if (!double.TryParse(normalizedPosition, NumberStyles.Float, CultureInfo.InvariantCulture, out double position))
    {
        Logger.Log($"Unable to read the book's position in the {curBook.seriesName} series.");
        return results;
    }

    HtmlDocument seriesHtmlDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
    seriesHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(goodreadsSeriesUrl));
    seriesNode = seriesHtmlDoc.DocumentNode.SelectSingleNode("//div[contains(@class, 'responsiveSeriesHeader__subtitle')]");
    match = Regex.Match(seriesNode?.InnerText ?? "", @"([0-9]+) (?:primary )?works?");
    if (match.Success)
    {
        curBook.totalInSeries = int.Parse(match.Groups[1].Value);
    }

    int positionInt = (int)position;
    int totalInt = curBook.totalInSeries;
    Logger.Log($"This is book {curBook.seriesPosition} of {curBook.totalInSeries} in the {curBook.seriesName} series");

    HtmlNodeCollection bookNodes = seriesHtmlDoc.DocumentNode.SelectNodes("//div[@itemtype='http://schema.org/Book']");
    if (bookNodes == null)
    {
        return results;
    }

    // A fractional entry such as 1.5 sits between books 1 and 2, so its predecessor is book 1.
    bool isFractional = normalizedPosition.Contains(".");
    string prevSearch = isFractional ? $"book {positionInt}" : $"book {positionInt - 1}";
    string nextSearch = $"book {positionInt + 1}";

    foreach (HtmlNode book in bookNodes)
    {
        var bookIndex = book.SelectSingleNode(".//div[@class='responsiveBook__header']")?.InnerText.ToLowerInvariant();
        if (bookIndex == null)
        {
            continue;
        }

        if (results.Count == 0 && bookIndex == prevSearch)
        {
            BookInfo prevBook = await ParseSeriesBookAsync(book).ConfigureAwait(false);
            if (prevBook != null)
            {
                results["Previous"] = prevBook;
                curBook.previousInSeries = prevBook;
                Logger.Log($"Preceded by: {prevBook.title}");
            }

            continue;
        }

        if (bookIndex == nextSearch)
        {
            BookInfo nextBook = await ParseSeriesBookAsync(book).ConfigureAwait(false);
            if (nextBook != null)
            {
                results["Next"] = nextBook;
                curBook.nextInSeries = nextBook;
                Logger.Log($"Followed by: {nextBook.title}");
            }
        }

        if (results.Count == 2 || results.Count == 1 && positionInt == totalInt)
        {
            break; // next and prev found or prev found and latest in series
        }
    }

    return results;
}

// Reads title, author and (via SearchBookASIN) ASIN from one book entry of a series page; null when the entry has no title link.
private async Task<BookInfo> ParseSeriesBookAsync(HtmlNode book)
{
    HtmlNode titleNode = book.SelectSingleNode(".//div[@class='u-paddingBottomXSmall']/a");
    if (titleNode == null)
    {
        return null;
    }

    BookInfo info = new BookInfo("", "", "");
    // Drop any trailing parenthesised text from the link text.
    info.title = Regex.Replace(titleNode.InnerText.Trim(), @" \(.*\)", "", RegexOptions.Compiled);
    Match idMatch = Regex.Match(titleNode.GetAttributeValue("href", ""), @"show/([0-9]+)");
    if (idMatch.Success)
    {
        info.asin = await SearchBookASIN(idMatch.Groups[1].Value, info.title).ConfigureAwait(false);
    }

    info.author = book.SelectSingleNode(".//span[@itemprop='author']//a")?.InnerText.Trim() ?? "";
    return info;
}
/// <summary>
/// Searches for the next and previous books in a series, if it is part of one.
/// Modifies curBook.previousInSeries to contain the found book info.
/// </summary>
/// <param name="curBook">Current book; its dataUrl is downloaded when no page is cached.</param>
/// <param name="authorProfile">Author profile whose otherBooks are matched against the series titles.</param>
/// <param name="TLD">Amazon top-level domain; replaced by "com" for the retry after a failed lookup.</param>
/// <returns>Next book in series</returns>
public override async Task<BookInfo> GetNextInSeries(BookInfo curBook, AuthorProfile authorProfile, string TLD)
{
    BookInfo nextBook = null;
    if (curBook.dataUrl == "")
    {
        return null;
    }

    if (sourceHtmlDoc == null)
    {
        sourceHtmlDoc = new HtmlDocument();
        sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
    }

    // Get title of next book
    Dictionary<string, BookInfo> seriesInfo = await GetNextInSeriesTitle(curBook);
    if (seriesInfo.TryGetValue("Next", out var book))
    {
        // TODO: next and previous sections are the same...
        // Search author's other books for the book (assumes next in series was written by the same author...)
        // Returns the first one found, though there should probably not be more than 1 of the same name anyway
        // The title is data, not a pattern: escape it so characters such as '?', '+' or '(' neither
        // change the match nor make the regex invalid.
        string nextTitlePattern = $@"^{Regex.Escape(book.title)}(?: \(.*\))?$";
        nextBook = authorProfile.otherBooks.FirstOrDefault(bk => Regex.IsMatch(bk.title, nextTitlePattern));
        if (nextBook == null)
        {
            // Attempt to search Amazon for the book instead
            // TODO: This should be elsewhere
            try
            {
                if (!string.IsNullOrEmpty(book.asin))
                {
                    nextBook = book;
                    await nextBook.GetAmazonInfo($"https://www.amazon.{TLD}/dp/{book.asin}");
                }
                else
                {
                    nextBook = await Amazon.SearchBook(book.title, book.author, TLD);
                }

                if (nextBook == null && settings.promptASIN)
                {
                    Logger.Log($"ASIN prompt for {book.title}...");
                    nextBook = new BookInfo(book.title, book.author, "");
                    frmAS.Text = "Next in Series";
                    frmAS.lblTitle.Text = book.title;
                    frmAS.tbAsin.Text = "";
                    frmAS.ShowDialog();
                    Logger.Log($"ASIN supplied: {frmAS.tbAsin.Text}");
                    string Url = $"https://www.amazon.{TLD}/dp/{frmAS.tbAsin.Text}";
                    await nextBook.GetAmazonInfo(Url);
                    nextBook.amazonUrl = Url;
                    nextBook.asin = frmAS.tbAsin.Text;
                }
            }
            catch
            {
                // Any failure above falls back to a plain search on Amazon.com.
                Logger.Log($"Failed to find {book.title} on Amazon.{TLD}, trying again with Amazon.com.");
                TLD = "com";
                nextBook = await Amazon.SearchBook(book.title, book.author, TLD);
            }

            if (nextBook != null)
            {
                await nextBook.GetAmazonInfo(nextBook.amazonUrl); //fill in desc, imageurl, and ratings
            }
        }

        if (nextBook == null)
        {
            Logger.Log("Book was found to be part of a series, but an error occurred finding the next book.\r\n"
                       + "Please report this book and the Goodreads URL and output log to improve parsing (if it's a real book).");
        }
    }
    else if (curBook.totalInSeries == 0)
    {
        Logger.Log("The book was not found to be part of a series.");
    }
    else if (curBook.seriesPosition != curBook.totalInSeries.ToString()
             && !curBook.seriesPosition?.Contains(".") == true)
    {
        Logger.Log("An error occurred finding the next book in series. The book may not be part of a series, or it is the latest release.");
    }

    if (seriesInfo.TryGetValue("Previous", out book))
    {
        string prevTitlePattern = $@"^{Regex.Escape(book.title)}(?: \(.*\))?$";
        var prevBook = authorProfile.otherBooks.FirstOrDefault(bk => Regex.IsMatch(bk.title, prevTitlePattern));

        // An ASIN that was never found is stored as "", not null, so test for empty as the "next"
        // branch does; otherwise this branch always ran and requested ".../dp/" with no ASIN.
        if (!string.IsNullOrEmpty(book.asin))
        {
            prevBook = book;
            await prevBook.GetAmazonInfo($"https://www.amazon.{TLD}/dp/{book.asin}");
        }
        else if (prevBook != null)
        {
            await prevBook.GetAmazonInfo(prevBook.amazonUrl);
        }

        if (prevBook == null && settings.promptASIN)
        {
            Logger.Log($"ASIN prompt for {book.title}...");
            prevBook = new BookInfo(book.title, book.author, "");
            frmAS.Text = "Previous in Series";
            frmAS.lblTitle.Text = book.title;
            frmAS.tbAsin.Text = "";
            frmAS.ShowDialog();
            Logger.Log($"ASIN supplied: {frmAS.tbAsin.Text}");
            string Url = $"https://www.amazon.{TLD}/dp/{frmAS.tbAsin.Text}";
            await prevBook.GetAmazonInfo(Url);
            prevBook.amazonUrl = Url;
            prevBook.asin = frmAS.tbAsin.Text;
        }

        if (prevBook == null)
        {
            Logger.Log("Book was found to be part of a series, but an error occurred finding the previous book.\r\n"
                       + "Please report this book and the Goodreads URL and output log to improve parsing.");
        }
        else
        {
            // Store whichever book was resolved; without this a match from the author's other books
            // or from the ASIN prompt was computed and then discarded.
            curBook.previousInSeries = prevBook;
        }
    }

    return nextBook;
}
/// <summary>
/// Searches Amazon for the book's author, resolves the author's page URL and ASIN, and downloads that page.
/// Modifies curBook.authorAsin when the author is found.
/// </summary>
/// <param name="curBook">Book whose author is searched; its asin is used in the optional HTML dump file name.</param>
/// <param name="TLD">Amazon top-level domain (the part after "amazon.").</param>
/// <returns>
/// Results holding the author ASIN and the loaded author page, or null when the author page could not
/// be located or its URL could not be parsed.
/// </returns>
public static async Task<AuthorSearchResults> SearchAuthor(BookInfo curBook, string TLD)
{
    AuthorSearchResults results = new AuthorSearchResults();

    //Generate Author search URL from author's name
    string newAuthor = Functions.FixAuthor(curBook.author);
    // Escape the name for use in a query string ('&', '#', '?' and friends would otherwise cut or
    // corrupt the keywords), keeping the '+' form for spaces that the plain replace produced.
    string plusAuthorName = Uri.EscapeDataString(newAuthor).Replace("%20", "+");
    //Updated to match Search "all" Amazon
    string amazonAuthorSearchUrl = $"https://www.amazon.{TLD}/s/ref=nb_sb_noss_2?url=search-alias%3Dstripbooks&field-keywords={plusAuthorName}";
    Logger.Log($"Searching for author's page on Amazon.{TLD}...");

    // Search Amazon for Author
    results.authorHtmlDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
    results.authorHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(amazonAuthorSearchUrl));

    if (Properties.Settings.Default.saveHtml)
    {
        try
        {
            Logger.Log("Saving Amazon's author search webpage...");
            File.WriteAllText(Environment.CurrentDirectory + $"\\dmp\\{curBook.asin}.authorsearchHtml.txt",
                              results.authorHtmlDoc.DocumentNode.InnerHtml);
        }
        catch (Exception ex)
        {
            Logger.Log(String.Format("An error ocurred saving authorsearchHtml.txt: {0}", ex.Message));
        }
    }

    // Check for captcha
    // TODO: Try to prompt for captcha and have user complete it to continue
    if (results.authorHtmlDoc.DocumentNode.InnerText.Contains("Robot Check"))
    {
        // A space now separates the two sentences, which previously ran together.
        Logger.Log($"Warning: Amazon.{TLD} is requesting a captcha. "
                   + $"You can try visiting Amazon.{TLD} in a real browser first, try another region, or try again later.");
    }

    // Try to find Author's page from Amazon search
    HtmlNode node = results.authorHtmlDoc.DocumentNode.SelectSingleNode("//*[@id='result_1']");
    if (node == null || !node.OuterHtml.Contains("/e/B"))
    {
        Logger.Log($"An error occurred finding author's page on Amazon.{TLD}."
                   + "\r\nUnable to create Author Profile."
                   + "\r\nEnsure the author metadata field matches the author's name exactly."
                   + $"\r\nSearch results can be viewed at {amazonAuthorSearchUrl}");
        return null;
    }

    string properAuthor = "";
    // Check for typical search results, second item is the author page
    if ((node = node.SelectSingleNode("//*[@id='result_1']/div/div/div/div/a")) != null)
    {
        properAuthor = node.GetAttributeValue("href", "");
        results.authorAsin = node.GetAttributeValue("data-asin", null) ?? AsinFromUrl(properAuthor);
    }
    // otherwise check for "by so-and-so" text beneath the titles for a possible match
    // (skipped when the name contains a double quote, which would terminate the XPath string literal)
    else if (!newAuthor.Contains("\"")
             && (node = results.authorHtmlDoc.DocumentNode.SelectSingleNode($"//div[@id='resultsCol']//li[@class='s-result-item celwidget ']//a[text()=\"{newAuthor}\"]")) != null)
    {
        properAuthor = node.GetAttributeValue("href", "");
        results.authorAsin = AsinFromUrl(properAuthor);
    }

    // The href is expected to look like "/Author-Name/e/ASIN..."; the second '/' ends the name segment.
    if (string.IsNullOrEmpty(properAuthor) || properAuthor.IndexOf('/', 1) < 3 || string.IsNullOrEmpty(results.authorAsin))
    {
        Logger.Log("Unable to parse author's page URL properly. Try again later or report this URL on the MobileRead thread: " + amazonAuthorSearchUrl);
        return null;
    }

    properAuthor = properAuthor.Substring(1, properAuthor.IndexOf('/', 1) - 1);
    string authorAmazonWebsiteLocationLog = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin;
    string authorAmazonWebsiteLocation = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin
                                         + "/ref=la_" + results.authorAsin
                                         + "_rf_p_n_feature_browse-b_2?fst=as%3Aoff&rh=n%3A283155%2Cp_82%3A" + results.authorAsin
                                         + "%2Cp_n_feature_browse-bin%3A618073011&bbn=283155&ie=UTF8&qid=1432378570&rnid=618072011";

    curBook.authorAsin = results.authorAsin;
    Logger.Log($"Author page found on Amazon!\r\nAuthor's Amazon Page URL: {authorAmazonWebsiteLocationLog}");

    // Load Author's Amazon page
    string authorpageHtml;
    try
    {
        authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocation);
    }
    catch
    {
        // If page not found (on co.uk at least, the long form does not seem to work) fallback to short form
        // and pray the formatting/item display suits our needs. If short form not found, crash back to caller.
        authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocationLog);
    }

    results.authorHtmlDoc.LoadHtml(authorpageHtml);
    return results;
}