/// <summary>
/// Collects the author's other books from the "searchWidget" layout of the author page.
/// As of 2018-07-31, format changed. For some amount of time, keep both just in case.
/// TODO: Switch to Kindle section and grab only those instead
/// </summary>
/// <param name="searchResults">Search results whose <c>authorHtmlDoc</c> is scanned for book entries.</param>
/// <param name="curTitle">Title of the current book; any result whose title contains it is skipped.</param>
/// <param name="curAuthor">Author name stored on every returned book.</param>
/// <param name="TLD">Amazon top-level domain used to build each book's <c>amazonUrl</c>.</param>
/// <returns>
/// The books for which a Kindle ASIN was found (possibly empty), or <c>null</c> when the page
/// has no <c>div#searchWidget</c> result nodes at all.
/// </returns>
/// <exception cref="DataSource.FormatChangedException">
/// A result entry does not contain the expected title nodes or title link.
/// </exception>
public static List<BookInfo> GetAuthorBooksNew(AuthorSearchResults searchResults, string curTitle, string curAuthor, string TLD)
{
    // Titles matching this pattern are collections/checklists/special editions rather than regular books.
    const string excludedTitlePattern = @"(Series|Reading) Order|Checklist|Edition|eSpecial|\([0-9]+ Book Series\)";
    // A Kindle ASIN appears either as ".../dp/Bxxxxxxxxx/" or as "/gp/product/Bxxxxxxxxx".
    const string asinPattern = "(dp/(?<asin>B[A-Z0-9]{9})/|/gp/product/(?<asin>B[A-Z0-9]{9}))";

    var resultsNodes = searchResults.authorHtmlDoc.DocumentNode.SelectNodes("//div[@id='searchWidget']/div");
    if (resultsNodes == null)
    {
        return null;
    }

    var bookList = new List<BookInfo>(resultsNodes.Count);
    foreach (var result in resultsNodes)
    {
        // The pagination control lives in the same container as the results; it is not a book.
        if (result.InnerHtml.Contains("a-pagination"))
        {
            continue;
        }

        var bookNodes = result.SelectNodes(".//div[@class='a-fixed-right-grid-inner']/div/div")
            ?? throw new DataSource.FormatChangedException(nameof(Amazon), "book results - title nodes");
        var name = bookNodes.FirstOrDefault()?.SelectSingleNode("./a")?.InnerText.Trim()
            ?? throw new DataSource.FormatChangedException(nameof(Amazon), "book results - title");

        // Exclude the current book title.
        // The exclusion list is a regular expression, so it is applied with Regex (as GetAuthorBooks does)
        // instead of being handed to ContainsIgnorecase.
        // NOTE(review): ContainsIgnorecase is defined elsewhere; if it is a plain substring test, the
        // previous code could never match this pattern - confirm.
        if (name.ContainsIgnorecase(curTitle)
            || Regex.IsMatch(name, excludedTitlePattern, RegexOptions.IgnoreCase))
        {
            continue;
        }

        // Get Kindle ASIN: take the first node whose markup contains a product link.
        var asin = "";
        foreach (var bookNode in bookNodes)
        {
            var match = Regex.Match(bookNode.OuterHtml, asinPattern, RegexOptions.Compiled);
            if (!match.Success)
            {
                continue;
            }

            asin = match.Groups["asin"].Value;
            break;
        }

        // TODO: This should be removable when the Kindle Only page is parsed instead
        if (asin == "")
        {
            continue;
            //throw new DataSource.FormatChangedException(nameof(Amazon), "book results - kindle edition asin");
        }

        bookList.Add(new BookInfo(name, curAuthor, asin)
        {
            amazonUrl = $"https://www.amazon.{TLD}/dp/{asin}"
        });
    }

    return bookList;
}
/// <summary>
/// Builds the Author Profile for the current book and writes it, serialized as JSON, to
/// <c>AuthorProfile.profile.&lt;asin&gt;.asc</c> in the output directory.
/// Along the way it searches Amazon for the author, obtains a biography (saved file, Amazon page,
/// or user input via Notepad), downloads the author image and gathers the author's other books.
/// </summary>
/// <returns>
/// <c>true</c> when the profile file was written. <c>false</c> when the profile already exists and
/// overwriting is disabled, the author could not be found, a biography file could not be read or
/// written, the author image could not be downloaded, or the profile file could not be written.
/// </returns>
/// <remarks>
/// Side effects visible in this method: may switch <c>_settings.AmazonTld</c> to "com" when the
/// configured site yields no result, shows message boxes, and sets <c>authorAsin</c>,
/// <c>BioTrimmed</c>, <c>authorImageUrl</c>, <c>otherBooks</c>, <c>ApTitle</c>, <c>ApSubTitle</c>
/// and <c>EaSubTitle</c>.
/// </remarks>
// TODO: Review this...
public async Task<bool> Generate()
{
    // Upper bound (in characters) for a biography taken from Amazon.
    const int maxBioLength = 1000;

    string outputDir;
    try
    {
        if (_settings.Android)
        {
            outputDir = _settings.OutDir + @"\Android\" + _curBook.asin;
            Directory.CreateDirectory(outputDir);
        }
        else
        {
            outputDir = _settings.UseSubDirectories
                ? Functions.GetBookOutputDirectory(_curBook.author, _curBook.sidecarName, true)
                : _settings.OutDir;
        }
    }
    catch (Exception ex)
    {
        Logger.Log("An error occurred creating output directory: " + ex.Message + "\r\nFiles will be placed in the default output directory.");
        outputDir = _settings.OutDir;
    }

    string ApPath = outputDir + @"\AuthorProfile.profile." + _curBook.asin + ".asc";

    if (!Properties.Settings.Default.overwrite && File.Exists(ApPath))
    {
        Logger.Log("AuthorProfile file already exists... Skipping!\r\n" +
                   "Please review the settings page if you want to overwite any existing files.");
        return false;
    }

    DataSources.AuthorSearchResults searchResults = null;
    // Attempt to download from the alternate site, if present. If it fails in some way, try .com
    // If the .com search crashes, it will crash back to the caller in frmMain
    try
    {
        searchResults = await DataSources.Amazon.SearchAuthor(_curBook, _settings.AmazonTld);
    }
    catch (Exception ex)
    {
        Logger.Log("Error searching Amazon." + _settings.AmazonTld + ": " + ex.Message + "\r\n" + ex.StackTrace);
    }

    // The retry is ordinary control flow (previously it lived in a finally block); the catch above
    // swallows every exception, so execution always reaches this point.
    if (searchResults == null)
    {
        Logger.Log(String.Format("Failed to find {0} on Amazon." + _settings.AmazonTld, _curBook.author));
        if (_settings.AmazonTld != "com")
        {
            Logger.Log("Trying again with Amazon.com.");
            _settings.AmazonTld = "com";
            searchResults = await DataSources.Amazon.SearchAuthor(_curBook, _settings.AmazonTld);
        }
    }

    if (searchResults == null)
    {
        return false; // Already logged error in search function
    }

    authorAsin = searchResults.authorAsin;

    if (Properties.Settings.Default.saveHtml)
    {
        try
        {
            Logger.Log("Saving author's Amazon webpage...");
            File.WriteAllText(Environment.CurrentDirectory + String.Format(@"\dmp\{0}.authorpageHtml.txt", _curBook.asin),
                searchResults.authorHtmlDoc.DocumentNode.InnerHtml);
        }
        catch (Exception ex)
        {
            Logger.Log(String.Format("An error occurred saving authorpageHtml.txt: {0}", ex.Message));
        }
    }

    // Try to find author's biography
    string bioFile = Environment.CurrentDirectory + @"\ext\" + authorAsin + ".bio";
    if (_settings.SaveBio && File.Exists(bioFile))
    {
        if (!readBio(bioFile))
        {
            return false;
        }
    }

    // Only consult Amazon when no biography has been loaded yet.
    if (BioTrimmed == "")
    {
        // TODO: Let users edit bio in same style as chapters and aliases
        HtmlNode bio = DataSources.Amazon.GetBioNode(searchResults, _settings.AmazonTld);
        //Trim authour biography to less than 1000 characters and/or replace more problematic characters.
        if (bio?.InnerText.Trim().Length > 0)
        {
            string bioText = bio.InnerText;
            if (bioText.Length > maxBioLength)
            {
                // Look for a break point inside the first maxBioLength characters only; searching the
                // whole string (as before) left the biography essentially untrimmed.
                int lastPunc = bioText.LastIndexOfAny(new[] { '.', '!', '?' }, maxBioLength - 1);
                int lastSpace = bioText.LastIndexOf(' ', maxBioLength - 1);
                if (lastPunc > lastSpace)
                {
                    // Ends on a sentence boundary, no ellipsis needed.
                    BioTrimmed = bioText.Substring(0, lastPunc + 1);
                }
                else if (lastSpace > 0)
                {
                    // Cut at the last word boundary and mark the truncation.
                    BioTrimmed = bioText.Substring(0, lastSpace) + '\u2026';
                }
                else
                {
                    // No usable break point at all: hard cut rather than Substring(0, -1).
                    BioTrimmed = bioText.Substring(0, maxBioLength - 1) + '\u2026';
                }
            }
            else
            {
                BioTrimmed = bioText;
            }

            BioTrimmed = BioTrimmed.Clean();
            Logger.Log("Author biography found on Amazon!");
        }
        else
        {
            // No biography on Amazon. This branch used to be attached to the outer
            // `if (BioTrimmed == "")`, so it ran (and blanked the bio file) precisely when a
            // biography HAD already been loaded, and never when Amazon returned nothing.
            if (!File.Exists(bioFile))
            {
                // Make sure there is a file for Notepad to open, without clobbering an existing one.
                File.WriteAllText(bioFile, String.Empty);
            }

            if (System.Windows.Forms.DialogResult.Yes == System.Windows.Forms.MessageBox.Show(
                    "No author biography found on Amazon!\r\nWould you like to create a biography?",
                    "Biography",
                    System.Windows.Forms.MessageBoxButtons.YesNo,
                    System.Windows.Forms.MessageBoxIcon.Question,
                    System.Windows.Forms.MessageBoxDefaultButton.Button2))
            {
                Functions.RunNotepad(bioFile);
                if (!readBio(bioFile))
                {
                    return false;
                }
            }
            else
            {
                BioTrimmed = "No author biography found on Amazon!";
                Logger.Log("An error occurred finding the author biography on Amazon.");
            }
        }
    }

    if (_settings.SaveBio)
    {
        if (!File.Exists(bioFile))
        {
            try
            {
                Logger.Log("Saving biography to " + bioFile);
                using (var streamWriter = new StreamWriter(bioFile, false, System.Text.Encoding.UTF8))
                {
                    streamWriter.Write(BioTrimmed);
                }
            }
            catch (Exception ex)
            {
                Logger.Log("An error occurred while writing biography.\r\n" + ex.Message + "\r\n" + ex.StackTrace);
                return false;
            }
        }

        if (System.Windows.Forms.DialogResult.Yes == System.Windows.Forms.MessageBox.Show(
                "Would you like to open the biography file in notepad for editing?",
                "Biography",
                System.Windows.Forms.MessageBoxButtons.YesNo,
                System.Windows.Forms.MessageBoxIcon.Question,
                System.Windows.Forms.MessageBoxDefaultButton.Button2))
        {
            Functions.RunNotepad(bioFile);
            if (!readBio(bioFile))
            {
                return false;
            }
        }
    }

    // Try to download Author image
    HtmlNode imageXpath = DataSources.Amazon.GetAuthorImageNode(searchResults, _settings.AmazonTld);
    // Strip the "_..._." segment from the image file name.
    authorImageUrl = Regex.Replace(imageXpath.GetAttributeValue("src", ""), @"_.*?_\.", string.Empty);

    // cleanup to match retail file image links
    if (authorImageUrl.Contains(@"https://images-na.ssl-images-amazon"))
    {
        authorImageUrl = authorImageUrl.Replace(@"https://images-na.ssl-images-amazon", @"http://ecx.images-amazon");
    }

    _curBook.authorImageUrl = authorImageUrl;

    Bitmap ApAuthorImage;
    try
    {
        Logger.Log("Downloading author image...");
        ApAuthorImage = await HttpDownloader.GetImage(authorImageUrl);
        Logger.Log("Grayscale base64-encoded author image created!");
    }
    catch (Exception ex)
    {
        Logger.Log(String.Format("An error occurred downloading the author image: {0}", ex.Message));
        return false;
    }

    Logger.Log("Gathering author's other books...");
    // Try the older page layout first; a null result means that layout was not present.
    var bookList = DataSources.Amazon.GetAuthorBooks(searchResults, _curBook.title, _curBook.author, _settings.AmazonTld)
                   ?? DataSources.Amazon.GetAuthorBooksNew(searchResults, _curBook.title, _curBook.author, _settings.AmazonTld);
    if (bookList != null)
    {
        Logger.Log("Gathering metadata for other books...");
        // Books are processed concurrently, so results are collected in a thread-safe bag.
        var bookBag = new ConcurrentBag<BookInfo>();
        await bookList.ParallelForEachAsync(async book =>
        {
            // TODO: retry a couple times if one fails maybe
            try
            {
                //Gather book desc, image url, etc, if using new format
                if (_settings.UseNewVersion)
                {
                    await book.GetAmazonInfo(book.amazonUrl);
                }

                bookBag.Add(book);
            }
            catch (Exception ex)
            {
                Logger.Log(String.Format("An error occurred gathering metadata for other books: {0}\r\nURL: {1}\r\nBook: {2}", ex.Message, book.amazonUrl, book.title));
                throw;
            }
        });
        otherBooks.AddRange(bookBag);
    }
    else
    {
        Logger.Log("Unable to find other books by this author. If there should be some, check the Amazon URL to ensure it is correct.");
    }

    Logger.Log("Writing Author Profile to file...");
    var authorOtherBooks = otherBooks.Select(book => new Model.AuthorProfile.Book
    {
        E = 1,
        Asin = book.asin,
        Title = book.title
    }).ToArray();

    var ap = new Model.AuthorProfile
    {
        Asin = _curBook.asin,
        CreationDate = Functions.UnixTimestampSeconds(),
        OtherBooks = authorOtherBooks,
        Authors = new[]
        {
            new Model.AuthorProfile.Author
            {
                Asin = authorAsin,
                Bio = BioTrimmed,
                ImageHeight = ApAuthorImage.Height,
                Name = _curBook.author,
                OtherBookAsins = otherBooks.Select(book => book.asin).ToArray(),
                Picture = Functions.ImageToBase64(ApAuthorImage, ImageFormat.Jpeg)
            }
        }
    };

    string authorProfileOutput = JsonConvert.SerializeObject(ap);

    try
    {
        File.WriteAllText(ApPath, authorProfileOutput);
        Logger.Log("Author Profile file created successfully!\r\nSaved to " + ApPath);
    }
    catch (Exception ex)
    {
        Logger.Log("An error occurred while writing the Author Profile file: " + ex.Message + "\r\n" + ex.StackTrace);
        return false;
    }

    ApTitle = "About " + _curBook.author;
    ApSubTitle = "Kindle Books By " + _curBook.author;
    EaSubTitle = "More Books By " + _curBook.author;
    return true;
}
/// <summary>
/// Searches Amazon for the author of <paramref name="curBook"/>, locates the author's page from
/// the search results and loads that page.
/// </summary>
/// <param name="curBook">Book whose <c>author</c> is searched for; its <c>authorAsin</c> is set on success.</param>
/// <param name="TLD">Amazon top-level domain to query (inserted after "amazon." in every URL).</param>
/// <returns>
/// Results whose <c>authorHtmlDoc</c> holds the author's page and whose <c>authorAsin</c> is set,
/// or <c>null</c> (after logging the reason) when no author page could be identified.
/// </returns>
/// <remarks>
/// Exceptions from downloading the search page, or from the short-form author URL when the long
/// form has already failed, are not caught here and propagate to the caller.
/// </remarks>
public static async Task<AuthorSearchResults> SearchAuthor(BookInfo curBook, string TLD)
{
    AuthorSearchResults results = new AuthorSearchResults();

    //Generate Author search URL from author's name
    string newAuthor = Functions.FixAuthor(curBook.author);
    // URL-encode the name so characters such as '&', '#' or '+' cannot corrupt the query string;
    // spaces keep the "+" form the search URL used before.
    string plusAuthorName = Uri.EscapeDataString(newAuthor).Replace("%20", "+");
    //Updated to match Search "all" Amazon
    string amazonAuthorSearchUrl = $"https://www.amazon.{TLD}/s/ref=nb_sb_noss_2?url=search-alias%3Dstripbooks&field-keywords={plusAuthorName}";
    Logger.Log($"Searching for author's page on Amazon.{TLD}...");

    // Search Amazon for Author
    results.authorHtmlDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
    results.authorHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(amazonAuthorSearchUrl));

    if (Properties.Settings.Default.saveHtml)
    {
        try
        {
            Logger.Log("Saving Amazon's author search webpage...");
            File.WriteAllText(Environment.CurrentDirectory + $"\\dmp\\{curBook.asin}.authorsearchHtml.txt",
                results.authorHtmlDoc.DocumentNode.InnerHtml);
        }
        catch (Exception ex)
        {
            Logger.Log(String.Format("An error ocurred saving authorsearchHtml.txt: {0}", ex.Message));
        }
    }

    // Check for captcha (only a warning is logged; parsing is still attempted below)
    // TODO: Try to prompt for captcha and have user complete it to continue
    if (results.authorHtmlDoc.DocumentNode.InnerText.Contains("Robot Check"))
    {
        Logger.Log($"Warning: Amazon.{TLD} is requesting a captcha."
                   + $"You can try visiting Amazon.{TLD} in a real browser first, try another region, or try again later.");
    }

    // Try to find Author's page from Amazon search
    // The second result must contain an author-page link ("/e/B...") to be usable.
    HtmlNode node = results.authorHtmlDoc.DocumentNode.SelectSingleNode("//*[@id='result_1']");
    if (node == null || !node.OuterHtml.Contains("/e/B"))
    {
        Logger.Log($"An error occurred finding author's page on Amazon.{TLD}." +
                   "\r\nUnable to create Author Profile." +
                   "\r\nEnsure the author metadata field matches the author's name exactly." +
                   $"\r\nSearch results can be viewed at {amazonAuthorSearchUrl}");
        return null;
    }

    string properAuthor = "";
    // Check for typical search results, second item is the author page
    if ((node = node.SelectSingleNode("//*[@id='result_1']/div/div/div/div/a")) != null)
    {
        properAuthor = node.GetAttributeValue("href", "");
        results.authorAsin = node.GetAttributeValue("data-asin", null)
                             ?? AsinFromUrl(properAuthor);
    }
    // otherwise check for "by so-and-so" text beneath the titles for a possible match
    else if ((node = results.authorHtmlDoc.DocumentNode.SelectSingleNode($"//div[@id='resultsCol']//li[@class='s-result-item celwidget ']//a[text()=\"{newAuthor}\"]")) != null)
    {
        properAuthor = node.GetAttributeValue("href", "");
        results.authorAsin = AsinFromUrl(properAuthor);
    }

    // The href is expected to look like "/<author-slug>/e/<asin>...": there must be a second '/'
    // far enough in to leave a slug, and an ASIN must have been extracted. A null ASIN is rejected
    // as well as an empty one, so a URL with a missing ASIN is never built.
    if (string.IsNullOrEmpty(properAuthor)
        || properAuthor.IndexOf('/', 1) < 3
        || string.IsNullOrEmpty(results.authorAsin))
    {
        Logger.Log("Unable to parse author's page URL properly. Try again later or report this URL on the MobileRead thread: " + amazonAuthorSearchUrl);
        return null;
    }

    // Keep only the slug between the leading '/' and the next '/'.
    properAuthor = properAuthor.Substring(1, properAuthor.IndexOf('/', 1) - 1);
    // Short form: used for logging and as the fallback download URL.
    string authorAmazonWebsiteLocationLog = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin;
    // Long form: same page with additional filter/query parameters appended.
    string authorAmazonWebsiteLocation = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin +
                                         "/ref=la_" + results.authorAsin +
                                         "_rf_p_n_feature_browse-b_2?fst=as%3Aoff&rh=n%3A283155%2Cp_82%3A" +
                                         results.authorAsin +
                                         "%2Cp_n_feature_browse-bin%3A618073011&bbn=283155&ie=UTF8&qid=1432378570&rnid=618072011";

    curBook.authorAsin = results.authorAsin;
    Logger.Log($"Author page found on Amazon!\r\nAuthor's Amazon Page URL: {authorAmazonWebsiteLocationLog}");

    // Load Author's Amazon page
    string authorpageHtml;
    try
    {
        authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocation);
    }
    catch
    {
        // If page not found (on co.uk at least, the long form does not seem to work) fallback to short form
        // and pray the formatting/item display suits our needs. If short form not found, crash back to caller.
        authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocationLog);
    }

    results.authorHtmlDoc.LoadHtml(authorpageHtml);
    return results;
}
/// <summary>
/// Collects the author's other Kindle books from the "mainResults" list of the author page,
/// falling back to the top carousel when that list yields no books.
/// </summary>
/// <param name="searchResults">Search results whose <c>authorHtmlDoc</c> is scanned for book entries.</param>
/// <param name="curTitle">Title of the current book; results whose title contains it (ignoring case) are skipped.</param>
/// <param name="curAuthor">Author name stored on every returned book.</param>
/// <param name="TLD">Amazon top-level domain used to build each book's <c>amazonUrl</c>.</param>
/// <returns>
/// The books found (possibly empty), or <c>null</c> when the page has no <c>div#mainResults</c>
/// list, or when that list produced no books and the page has no carousel either.
/// </returns>
public static List<BookInfo> GetAuthorBooks(AuthorSearchResults searchResults, string curTitle, string curAuthor, string TLD)
{
    // Titles matching this pattern are collections/checklists/special editions rather than regular books.
    const string excludedTitlePattern = @"(Series|Reading) Order|Checklist|Edition|eSpecial|\([0-9]+ Book Series\)";
    const string asinPattern = "dp/(B[A-Z0-9]{9})/";

    HtmlNodeCollection resultsNodes = searchResults.authorHtmlDoc.DocumentNode.SelectNodes("//div[@id='mainResults']/ul/li");
    if (resultsNodes == null)
    {
        return null;
    }

    List<BookInfo> bookList = new List<BookInfo>(resultsNodes.Count);

    // True when the title is the current book or matches the exclusion pattern.
    // The current title is escaped because it is literal text, not a pattern: titles containing
    // regex metacharacters ("What If?", "C++", "(Book 1)") previously mis-matched or threw.
    // An empty title is skipped because an empty pattern matches every string.
    bool IsExcludedTitle(string title)
    {
        bool isCurrentBook = !string.IsNullOrEmpty(curTitle)
                             && Regex.IsMatch(title, Regex.Escape(curTitle), RegexOptions.IgnoreCase);
        return isCurrentBook || Regex.IsMatch(title, excludedTitlePattern, RegexOptions.IgnoreCase);
    }

    // Pulls the Kindle ASIN out of a node's markup; empty string when there is none.
    string ExtractAsin(HtmlNode linkNode)
    {
        Match match = Regex.Match(linkNode.OuterHtml, asinPattern);
        return match.Success ? match.Groups[1].Value : "";
    }

    // Adds the book only when both a title and an ASIN were found.
    void AddBook(string name, string asin)
    {
        if (name == "" || asin == "")
        {
            return;
        }

        bookList.Add(new BookInfo(name, curAuthor, asin)
        {
            amazonUrl = $"https://www.amazon.{TLD}/dp/{asin}"
        });
    }

    foreach (HtmlNode result in resultsNodes)
    {
        if (!result.Id.StartsWith("result_", StringComparison.Ordinal))
        {
            continue;
        }

        HtmlNode titleNode = result.SelectSingleNode(".//div[@class='a-row a-spacing-small']/a/h2");
        if (titleNode == null)
        {
            continue;
        }

        //Exclude the current book title from other books search
        if (IsExcludedTitle(titleNode.InnerText))
        {
            continue;
        }

        // Results without a Kindle edition have no such node; skip them instead of dereferencing null.
        HtmlNode kindleNode = result.SelectSingleNode(".//*[@title='Kindle Edition']");
        if (kindleNode == null)
        {
            continue;
        }

        AddBook(titleNode.InnerText.Trim(), ExtractAsin(kindleNode));
    }

    // If no kindle books returned, try the top carousel
    if (bookList.Count == 0)
    {
        resultsNodes = searchResults.authorHtmlDoc.DocumentNode.SelectNodes("//ol[@class='a-carousel' and @role ='list']/li");
        if (resultsNodes == null)
        {
            return null;
        }

        foreach (HtmlNode result in resultsNodes)
        {
            // In the carousel the title is carried by the cover image's alt text.
            HtmlNode imageNode = result.SelectSingleNode(".//a/img");
            if (imageNode == null)
            {
                continue;
            }

            var name = imageNode.GetAttributeValue("alt", "");
            //Exclude the current book title from other books search
            if (IsExcludedTitle(name))
            {
                continue;
            }

            HtmlNode linkNode = result.SelectSingleNode(".//a");
            if (linkNode == null)
            {
                continue;
            }

            AddBook(name, ExtractAsin(linkNode));
        }
    }

    return bookList;
}
/// <summary>
/// Locates the <c>img</c> element carrying the author's picture on the author page.
/// </summary>
/// <param name="searchResults">Search results whose <c>authorHtmlDoc</c> is queried.</param>
/// <param name="TLD">Not used by this method.</param>
/// <returns>
/// The <c>img</c> under <c>div#ap-image</c> if present, otherwise the one under <c>div#authorImage</c>.
/// </returns>
/// <exception cref="DataSource.FormatChangedException">Neither location contains an image node.</exception>
public static HtmlNode GetAuthorImageNode(AuthorSearchResults searchResults, string TLD)
{
    // Candidate locations, tried in order of preference.
    string[] imageXPaths =
    {
        "//div[@id='ap-image']/img",
        "//div[@id='authorImage']/img"
    };

    foreach (string xpath in imageXPaths)
    {
        HtmlNode imageNode = searchResults.authorHtmlDoc.DocumentNode.SelectSingleNode(xpath);
        if (imageNode != null)
        {
            return imageNode;
        }
    }

    throw new DataSource.FormatChangedException(nameof(Amazon), "author image");
}
/// <summary>
/// Get biography from results page; TLD included in case different Amazon sites have different formatting.
/// </summary>
/// <param name="searchResults">Search results whose <c>authorHtmlDoc</c> is queried.</param>
/// <param name="TLD">Currently not used by this method.</param>
/// <returns>
/// The biography <c>span</c> inside <c>div#ap-bio</c> (class <c>a-row</c>) if present, otherwise
/// <c>span#author_biography</c>.
/// </returns>
/// <exception cref="DataSource.FormatChangedException">Neither location contains a biography node.</exception>
public static HtmlNode GetBioNode(AuthorSearchResults searchResults, string TLD)
{
    HtmlNode bioNode = searchResults.authorHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='ap-bio' and @class='a-row']/div/div/span");
    if (bioNode != null)
    {
        return bioNode;
    }

    // Alternate location for the biography.
    bioNode = searchResults.authorHtmlDoc.DocumentNode.SelectSingleNode("//span[@id='author_biography']");
    if (bioNode != null)
    {
        return bioNode;
    }

    throw new DataSource.FormatChangedException(nameof(Amazon), "author bio");
}