HttpDownloader.GetPageHtmlAsync C# (CSharp) Code-Beispiele

Beispiel #1

0

Datei anzeigen

Datei: Shelfari.cs Projekt: iyedg/xray-builder.gui

        /// <summary>
        /// Collects the quotations listed in the "WikiModule_Quotations" section of a book page.
        /// </summary>
        /// <param name="url">Page to download when <paramref name="srcDoc"/> is null.</param>
        /// <param name="token">Cancellation token; this implementation does not observe it.</param>
        /// <param name="srcDoc">Already-parsed page. When null, the page at <paramref name="url"/> is downloaded and parsed.</param>
        /// <param name="progress">Progress reporter; this implementation does not use it.</param>
        /// <returns>One clip per quotation found (Likes is always 0); an empty list when no quotation nodes exist.</returns>
        public override async Task<List<NotableClip>> GetNotableClips(string url, CancellationToken token, HtmlDocument srcDoc = null, IProgressBar progress = null)
        {
            if (srcDoc == null)
            {
                srcDoc = new HtmlDocument();
                srcDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(url));
            }

            var result = new List<NotableClip>();

            // Query the document that was supplied or just downloaded. The previous code queried the
            // sourceHtmlDoc field here, which ignored srcDoc entirely and threw when the field was null.
            HtmlNodeCollection quoteNodes = srcDoc.DocumentNode.SelectNodes("//div[@id='WikiModule_Quotations']/div/ul[@class='li_6']/li");
            if (quoteNodes == null)
            {
                return result;
            }

            foreach (HtmlNode quoteNode in quoteNodes)
            {
                HtmlNode node = quoteNode.SelectSingleNode(".//blockquote");
                if (node == null)
                {
                    continue;
                }

                // Strip up to two leading/trailing curly-quote entities (people sometimes add their own quotes as well)
                string quote = node.InnerText;
                quote = Regex.Replace(quote, "^(&ldquo;){1,2}", "");
                quote = Regex.Replace(quote, "(&rdquo;){1,2}$", "");
                result.Add(new NotableClip { Text = quote, Likes = 0 });
            }

            return result;
        }

Beispiel #2

0

Datei anzeigen

        /// <summary>
        /// Reads the page count from the "details" section of the book page and derives an estimated reading time.
        /// Modifies curBook (pagesInBook, readingHours, readingMinutes).
        /// </summary>
        /// <param name="curBook">Book whose dataUrl is downloaded when no page is cached in sourceHtmlDoc.</param>
        /// <returns>true when a page count was found and stored; otherwise false.</returns>
        public override async Task<bool> GetPageCount(BookInfo curBook)
        {
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            HtmlNode pagesNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='details']");
            if (pagesNode == null)
            {
                return false;
            }

            Match match = Regex.Match(pagesNode.InnerText, @"((\d+)|(\d+,\d+)) pages");
            if (!match.Success)
            {
                return false;
            }

            // Parse once, allowing the "1,234" form and using the invariant culture: the scraped text uses ','
            // as the group separator regardless of the user's locale. The previous second int.Parse call
            // omitted AllowThousands and threw a FormatException for any book with 1,000+ pages.
            string pagesText = match.Groups[1].Value;
            if (!int.TryParse(pagesText, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out int pages))
            {
                return false;
            }

            // 1.2890625 minutes per page is the reading-speed factor used by the original implementation.
            TimeSpan span = TimeSpan.FromMinutes(pages * 1.2890625);

            // TimeSpan.Hours is only the 0-23 component; TotalHours keeps estimates of a day or more intact.
            int hours = (int)span.TotalHours;

            Logger.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)", hours, span.Minutes, pagesText));
            curBook.pagesInBook    = pages;
            curBook.readingHours   = hours;
            curBook.readingMinutes = span.Minutes;
            return true;
        }

Beispiel #3

0

Datei anzeigen

Datei: Shelfari.cs Projekt: iyedg/xray-builder.gui

        /// <summary>
        /// Reads "Page Count: N" from the "WikiModule_FirstEdition" section of the book page and derives an
        /// estimated reading time. Modifies curBook (pagesInBook, readingHours, readingMinutes).
        /// </summary>
        /// <param name="curBook">Book whose dataUrl is downloaded when no page is cached in sourceHtmlDoc.</param>
        /// <returns>true when a page count was found and stored; otherwise false.</returns>
        public override async Task<bool> GetPageCount(BookInfo curBook)
        {
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            HtmlNode pageNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
            HtmlNode node1    = pageNode?.SelectSingleNode(".//div/div");
            if (node1 == null)
            {
                return false;
            }

            // The grouped form must be tried first: with nothing after the number to anchor the match, the old
            // pattern "(\d+)|(\d+,\d+)" always stopped at the first alternative and captured only "1" from "1,234".
            Match match1 = Regex.Match(node1.InnerText, @"Page Count: (\d{1,3}(?:,\d{3})+|\d+)");
            if (!match1.Success)
            {
                return false;
            }

            // Parse once with AllowThousands and the invariant culture (',' is the group separator in the
            // scraped text regardless of locale). The previous second int.Parse call omitted AllowThousands.
            string pagesText = match1.Groups[1].Value;
            if (!int.TryParse(pagesText, NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out int pages))
            {
                return false;
            }

            // Multiply the page count by the average reading time per page (minutes).
            TimeSpan span = TimeSpan.FromMinutes(pages * 1.2890625);

            // TimeSpan.Hours is only the 0-23 component; TotalHours keeps estimates of a day or more intact.
            int hours = (int)span.TotalHours;

            Logger.Log(String.Format("Typical time to read: {0} hours and {1} minutes ({2} pages)", hours, span.Minutes, pagesText));
            curBook.pagesInBook    = pages;
            curBook.readingHours   = hours;
            curBook.readingMinutes = span.Minutes;
            return true;
        }

Beispiel #4

0

Datei anzeigen

        /// <summary>
        /// Scrape any notable quotes from Goodreads and grab ratings if missing from book info.
        /// Modifies curBook (notableClips, amazonRating, numReviews).
        /// </summary>
        /// <param name="curBook">Book to fill in; its dataUrl is downloaded when no page is cached in sourceHtmlDoc.</param>
        /// <param name="token">Cancellation token forwarded to GetNotableClips.</param>
        /// <param name="progress">Optional progress reporter forwarded to GetNotableClips.</param>
        public override async Task GetExtras(BookInfo curBook, CancellationToken token, IProgressBar progress = null)
        {
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            if (curBook.notableClips == null)
            {
                // The URL is unused here because the already-loaded document is passed along.
                curBook.notableClips = await GetNotableClips("", token, sourceHtmlDoc, progress).ConfigureAwait(false);
            }

            // Add rating and reviews count if missing from Amazon book info
            HtmlNode metaNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='bookMeta']");
            if (metaNode == null || curBook.amazonRating != 0)
            {
                return;
            }

            // NOTE(review): this XPath starts with "//" rather than ".//" - confirm whether it is meant to be
            // scoped to metaNode like the two queries below.
            HtmlNode goodreadsRating = metaNode.SelectSingleNode("//span[@class='value rating']");
            if (goodreadsRating != null)
            {
                // The page writes the rating with '.' as the decimal separator, so parse with the invariant
                // culture; a culture-sensitive parse reads "4.05" as 405 on comma-decimal systems.
                // TryParse also keeps a malformed rating from aborting the remaining extraction.
                if (float.TryParse(goodreadsRating.InnerText,
                                   System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                   System.Globalization.CultureInfo.InvariantCulture,
                                   out float rating))
                {
                    curBook.amazonRating = rating;
                }
            }

            HtmlNode passagesNode = metaNode.SelectSingleNode(".//a[@class='actionLinkLite votes' and @href='#other_reviews']")
                                    ?? metaNode.SelectSingleNode(".//span[@class='count value-title']");
            if (passagesNode == null)
            {
                return;
            }

            // Accepts plain digits or digit groups separated by ',' or '.', followed by whitespace.
            Match match = Regex.Match(passagesNode.InnerText, @"(\d+|\d{1,3}([,\.]\d{3})*)(?=\s)");
            if (match.Success)
            {
                string digits = match.Value.Replace(",", "").Replace(".", "");
                curBook.numReviews = int.Parse(digits, System.Globalization.CultureInfo.InvariantCulture);
            }
        }

Beispiel #5

0

Datei anzeigen

        /// <summary>
        /// Runs a Goodreads search for the given author and title and parses the result page.
        /// </summary>
        /// <param name="author">Author name; passed through Functions.FixAuthor and URL-escaped before use.</param>
        /// <param name="title">Book title; URL-escaped before use.</param>
        /// <returns>The output of ParseSearchResults, or null when the page text contains "No results".</returns>
        public override async Task<List<BookInfo>> SearchBook(string author, string title)
        {
            const string searchUrlFormat = @"https://www.goodreads.com/search?q={0}%20{1}";

            string escapedTitle  = Uri.EscapeDataString(title);
            string escapedAuthor = Uri.EscapeDataString(Functions.FixAuthor(author));
            string searchUrl     = String.Format(searchUrlFormat, escapedAuthor, escapedTitle);

            string pageHtml = await HttpDownloader.GetPageHtmlAsync(searchUrl);
            var searchDoc = new HtmlDocument();
            searchDoc.LoadHtml(pageHtml);

            if (searchDoc.DocumentNode.InnerText.Contains("No results"))
            {
                return null;
            }

            return ParseSearchResults(searchDoc);
        }

Beispiel #6

0

Datei anzeigen

        /// <summary>
        /// Builds the list of character terms from the "Characters" info box of a Goodreads book page,
        /// fetching each linked character page via GetTerm with up to MaxConcurrentRequests in parallel.
        /// </summary>
        /// <param name="dataUrl">Book page URL; downloaded when no page is cached in sourceHtmlDoc.</param>
        /// <param name="progress">Optional progress reporter, advanced once per successfully processed character.</param>
        /// <param name="token">Cancellation token passed to the parallel loop.</param>
        /// <returns>The terms that could be retrieved; empty when the page lists no characters.</returns>
        public override async Task<List<XRay.Term>> GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
        {
            if (sourceHtmlDoc == null)
            {
                Logger.Log("Downloading Goodreads page...");
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
            }

            var charNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/a");
            if (charNodes == null)
            {
                return new List<XRay.Term>();
            }

            // Check if ...more link exists on Goodreads page; the extra characters sit in a toggled span
            var moreCharNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/span[@class='toggleContent']/a");
            var allChars      = moreCharNodes == null ? charNodes : charNodes.Concat(moreCharNodes);
            var termCount     = moreCharNodes == null ? charNodes.Count : charNodes.Count + moreCharNodes.Count;

            Logger.Log($"Gathering term information from Goodreads... ({termCount})");
            progress?.Set(0, termCount);
            if (termCount > 20)
            {
                Logger.Log("More than 20 characters found. Consider using the 'download to XML' option if you need to build repeatedly.");
            }

            var terms = new ConcurrentBag<XRay.Term>();
            await allChars.ParallelForEachAsync(async charNode =>
            {
                string relativeUrl = charNode.GetAttributeValue("href", "");
                try
                {
                    terms.AddNotNull(await GetTerm(dataUrl, relativeUrl).ConfigureAwait(false));
                    progress?.Add(1);
                }
                catch (Exception ex)
                {
                    // One bad character page must not abort the whole run, but it should not vanish silently
                    // either. Previously only messages containing "(404)" were logged and every other failure
                    // was discarded without a trace.
                    Logger.Log("Error getting page for character. URL: " + "https://www.goodreads.com" + relativeUrl
                               + "\r\nMessage: " + ex.Message + "\r\n" + ex.StackTrace);
                }
            }, MaxConcurrentRequests, token);

            return terms.ToList();
        }

Beispiel #7

0

Datei anzeigen

        /// <summary>
        /// Search Goodreads for a possible Kindle edition of a book and return its ASIN.
        /// </summary>
        /// <param name="id">Goodreads book id used to build the book page URL.</param>
        /// <param name="title">Book title; only used in the error log message.</param>
        /// <returns>The first ASIN-shaped token found in the Kindle editions list, or "" when none is found or an error occurs.</returns>
        private async Task<string> SearchBookASIN(string id, string title)
        {
            string goodreadsBookUrl = String.Format("https://www.goodreads.com/book/show/{0}", id);

            try
            {
                HtmlDocument bookHtmlDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
                bookHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(goodreadsBookUrl));

                // A book page without an "other editions" link is an ordinary "nothing found" outcome. Handle it
                // explicitly instead of relying on a NullReferenceException being caught and logged below.
                HtmlNode link = bookHtmlDoc.DocumentNode.SelectSingleNode("//div[@class='otherEditionsActions']/a");
                if (link == null)
                {
                    return "";
                }

                // The editions link carries the work id needed for the editions listing
                Match match = Regex.Match(link.GetAttributeValue("href", ""), @"editions/([0-9]*)-");
                if (!match.Success)
                {
                    return "";
                }

                string kindleEditionsUrl = String.Format("https://www.goodreads.com/work/editions/{0}?utf8=%E2%9C%93&sort=num_ratings&filter_by_format=Kindle+Edition", match.Groups[1].Value);
                bookHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(kindleEditionsUrl));

                HtmlNodeCollection bookNodes = bookHtmlDoc.DocumentNode.SelectNodes("//div[@class='elementList clearFix']");
                if (bookNodes == null)
                {
                    return "";
                }

                foreach (HtmlNode book in bookNodes)
                {
                    // First token that looks like an ASIN ('B' followed by nine upper-case letters or digits)
                    match = Regex.Match(book.InnerHtml, "(B[A-Z0-9]{9})");
                    if (match.Success)
                    {
                        return match.Value;
                    }
                }

                return "";
            }
            catch (Exception ex)
            {
                Logger.Log(String.Format("An error occurred while searching for {0}s ASIN.\r\n", title) + ex.Message + "\r\n" + ex.StackTrace);
                return "";
            }
        }

Beispiel #8

0

Datei anzeigen

        // TODO: All calls to Amazon should check for the captcha page (or ideally avoid it somehow)
        /// <summary>
        /// Searches Amazon for a Kindle edition matching the title and author, looking only at the first two results.
        /// </summary>
        /// <param name="title">Book title; anything from the first " (" onwards is dropped before searching.</param>
        /// <param name="author">Author name, appended to the search keywords and stored on the result.</param>
        /// <param name="TLD">Amazon top-level domain inserted into the search URL (e.g. "com").</param>
        /// <returns>A BookInfo with its amazonUrl set, or null when no usable Kindle result is found.</returns>
        public static async Task<BookInfo> SearchBook(string title, string author, string TLD)
        {
            int parenIndex = title.IndexOf(" (", StringComparison.Ordinal);
            if (parenIndex >= 0)
            {
                title = title.Substring(0, parenIndex);
            }

            //Search "all" Amazon
            string searchUrl = String.Format(@"https://www.amazon.{0}/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords={1}",
                                             TLD, Uri.EscapeDataString(title + " " + author));
            HtmlDocument searchDoc = new HtmlDocument();
            searchDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(searchUrl));

            HtmlNode node     = searchDoc.DocumentNode.SelectSingleNode("//li[@id='result_0']");
            HtmlNode nodeASIN = node?.SelectSingleNode(".//a[@title='Kindle Edition']");
            if (nodeASIN == null)
            {
                // Fall back to the second result when the first has no Kindle edition link
                node     = searchDoc.DocumentNode.SelectSingleNode("//li[@id='result_1']");
                nodeASIN = node?.SelectSingleNode(".//a[@title='Kindle Edition']");
            }

            //At least attempt to verify it might be the same book?
            if (node == null || nodeASIN == null || node.InnerText.IndexOf(title, StringComparison.OrdinalIgnoreCase) < 0)
            {
                return null;
            }

            // Without an ASIN the result is unusable; previously an unsuccessful match produced a BookInfo
            // with an empty ASIN and an empty URL.
            Match foundASIN = Regex.Match(nodeASIN.OuterHtml, "(B[A-Z0-9]{9})");
            if (!foundASIN.Success)
            {
                return null;
            }

            HtmlNode titleNode = node.SelectSingleNode(".//div/div/div/div[@class='a-fixed-left-grid-col a-col-right']/div/a");
            if (titleNode == null)
            {
                return null;
            }

            BookInfo result = new BookInfo(titleNode.InnerText, author, foundASIN.Value);

            // Grab the true link for good measure: cut the href right after the ASIN. The ASIN was matched in
            // the element's outer HTML, so it may be absent from the href itself; keep the whole href then
            // instead of slicing at a bogus offset.
            string href      = nodeASIN.GetAttributeValue("href", "");
            int    asinIndex = href.IndexOf(foundASIN.Value, StringComparison.Ordinal);
            result.amazonUrl = asinIndex >= 0
                ? href.Substring(0, asinIndex + foundASIN.Length)
                : href;

            return result;
        }

Beispiel #9

0

Datei anzeigen

        // Are there actually any goodreads pages that aren't at goodreads.com for other languages??
        /// <summary>
        /// Downloads a Goodreads character page and builds a "character" term from it (name, aliases, description).
        /// </summary>
        /// <param name="baseUrl">Absolute URL whose scheme and authority are combined with <paramref name="relativeUrl"/>.</param>
        /// <param name="relativeUrl">Link to the character page, relative to the site root.</param>
        /// <returns>
        /// The populated term, or null when the page lacks the main content block or its h1 heading
        /// (previously this case surfaced as a NullReferenceException).
        /// </returns>
        private async Task<XRay.Term> GetTerm(string baseUrl, string relativeUrl)
        {
            XRay.Term result = new XRay.Term("character");

            Uri siteRoot = new Uri(new Uri(baseUrl).GetLeftPart(UriPartial.Authority));
            Uri termUri  = new Uri(siteRoot, relativeUrl);
            result.DescSrc = "Goodreads";
            result.DescUrl = termUri.ToString();

            HtmlDocument charDoc = new HtmlDocument();
            charDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(termUri.ToString()));

            // The class attribute is matched both without and with a trailing space
            HtmlNode mainNode = charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat']")
                                ?? charDoc.DocumentNode.SelectSingleNode("//div[@class='mainContentFloat ']");
            HtmlNode nameNode = mainNode?.SelectSingleNode("./h1");
            if (nameNode == null)
            {
                // Not a recognisable character page; there is nothing to name the term with.
                return null;
            }
            result.TermName = nameNode.InnerText;

            // Without the details box there are no aliases or description to add; return the name-only term,
            // matching the existing "no floating boxes" outcome below.
            HtmlNode detailsNode = mainNode.SelectSingleNode("//div[@class='grey500BoxContent']");
            HtmlNodeCollection tempNodes = detailsNode?.SelectNodes("//div[@class='floatingBox']");
            if (tempNodes == null)
            {
                return result;
            }

            foreach (HtmlNode tempNode in tempNodes)
            {
                string text = tempNode.InnerText.Replace("[close]", "").Trim();
                if (tempNode.Id.Contains("_aliases")) // If present, add any aliases found
                {
                    result.Aliases.AddRange(text.Split(new[] { ", " }, StringSplitOptions.RemoveEmptyEntries));
                }
                else
                {
                    result.Desc = text;
                }
            }

            return result;
        }

Beispiel #10

0

Datei anzeigen

        /// <summary>
        /// Gather the list of quotes and number of times they've been liked -- close enough to
        /// "x paragraphs have been highlighted y times" from Amazon.
        /// </summary>
        /// <param name="url">Book page to download when <paramref name="srcDoc"/> is null.</param>
        /// <param name="token">Cancellation token passed to the parallel page download loop.</param>
        /// <param name="srcDoc">Already-parsed book page. When null, the page at <paramref name="url"/> is downloaded.</param>
        /// <param name="progress">Optional progress reporter, advanced once per quote page.</param>
        /// <returns>
        /// All quotes with a parsable text and like count across every quote page, or null when the book page
        /// has no "Quotes from" link or the first quote page has no pagination element.
        /// </returns>
        public override async Task<List<NotableClip>> GetNotableClips(string url, CancellationToken token, HtmlDocument srcDoc = null, IProgressBar progress = null)
        {
            if (srcDoc == null)
            {
                srcDoc = new HtmlDocument();
                srcDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(url));
            }

            HtmlNode quoteNode = srcDoc.DocumentNode.SelectSingleNode("//div[@class='h2Container gradientHeaderContainer']/h2/a[starts-with(.,'Quotes from')]");
            if (quoteNode == null)
            {
                return null;
            }

            // {0} is left as a placeholder for the page number
            string quoteURL = $"https://www.goodreads.com{quoteNode.GetAttributeValue("href", "")}?page={{0}}";

            progress?.Set(0, 1);

            var quoteBag    = new ConcurrentBag<IEnumerable<NotableClip>>();
            var initialPage = new HtmlDocument();
            initialPage.LoadHtml(await HttpDownloader.GetPageHtmlAsync(string.Format(quoteURL, 1)));

            // check how many pages there are (find previous page button, get parent div, take all children of that, 2nd last one should be the max page count
            HtmlNode maxPageNode = initialPage.DocumentNode.SelectSingleNode("//span[contains(@class,'previous_page')]/parent::div/*[last()-1]");
            if (maxPageNode == null)
            {
                // NOTE(review): this also discards the quotes already present on the first page - confirm that
                // a missing pagination element really means "no quotes" rather than "a single page".
                return null;
            }
            if (!int.TryParse(maxPageNode.InnerHtml, out var maxPages))
            {
                maxPages = 1;
            }

            // Extracts (text, likes) pairs from one quote page; entries missing either part are skipped.
            IEnumerable<NotableClip> ParseQuotePage(HtmlDocument quoteDoc)
            {
                HtmlNodeCollection tempNodes = quoteDoc.DocumentNode.SelectNodes("//div[@class='quotes']/div[@class='quote']");

                return tempNodes?.Select(node =>
                {
                    var quoteMatch = Regex.Match(node.InnerText, "&ldquo;(.*?)&rdquo;", RegexOptions.Compiled);
                    var likesMatch = Regex.Match(node.SelectSingleNode(".//div[@class='right']/a")?.InnerText ?? "",
                                                 @"(\d+) likes", RegexOptions.Compiled);
                    if (!quoteMatch.Success || !likesMatch.Success)
                    {
                        return null;
                    }
                    return new NotableClip
                    {
                        Text = quoteMatch.Groups[1].Value,
                        Likes = int.Parse(likesMatch.Groups[1].Value)
                    };
                }).Where(quote => quote != null);
            }

            quoteBag.Add(ParseQuotePage(initialPage));
            progress?.Set(1, maxPages);

            // Page 1 is already processed, so pages 2..maxPages remain. Enumerable.Range takes (start, count),
            // not (start, end): the previous Range(2, maxPages) requested one page past the last one and
            // advanced the progress bar beyond its maximum.
            if (maxPages > 1)
            {
                await Enumerable.Range(2, maxPages - 1).ParallelForEachAsync(async page =>
                {
                    var quotePage = new HtmlDocument();
                    quotePage.LoadHtml(await HttpDownloader.GetPageHtmlAsync(string.Format(quoteURL, page)));
                    quoteBag.Add(ParseQuotePage(quotePage));
                    progress?.Add(1);
                }, MaxConcurrentRequests, token);
            }

            return quoteBag.Where(quotes => quotes != null && quotes.Any()).SelectMany(quotes => quotes).ToList();
        }

Beispiel #11

0

Datei anzeigen

Datei: Shelfari.cs Projekt: iyedg/xray-builder.gui

        /// <summary>
        /// Finds the next book in the series of <paramref name="curBook"/> and, as a side effect, fills in
        /// curBook.previousInSeries when a preceding title is known and not yet populated.
        /// </summary>
        /// <param name="curBook">Current book; returns null immediately when its dataUrl is empty.</param>
        /// <param name="authorProfile">Author profile whose otherBooks are searched first for the next title.</param>
        /// <param name="TLD">Amazon top-level domain used for the Amazon search fallback.</param>
        /// <returns>The next book in the series, or null when it could not be determined.</returns>
        public override async Task<BookInfo> GetNextInSeries(BookInfo curBook, AuthorProfile authorProfile, string TLD)
        {
            if (curBook.dataUrl == "")
            {
                return null;
            }
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            // Get titles (and Kindle edition page URLs) of the neighbouring books
            Dictionary<string, string> seriesInfo = await GetNextInSeriesTitle(curBook).ConfigureAwait(false);
            BookInfo nextBook = null;

            if (seriesInfo.TryGetValue("Next", out var title))
            {
                // Search author's other books for the book (assumes next in series was written by the same author...)
                // Returns the first one found, though there should probably not be more than 1 of the same name anyway
                nextBook = authorProfile.otherBooks.FirstOrDefault(bk => bk.title == title);
                if (nextBook == null)
                {
                    // Attempt to search Amazon for the book instead
                    nextBook = await Amazon.SearchBook(title, curBook.author, TLD);
                    if (nextBook != null)
                    {
                        await nextBook.GetAmazonInfo(nextBook.amazonUrl); //fill in desc, imageurl, and ratings
                    }
                }
                // Try to fill in desc, imageurl, and ratings using Shelfari Kindle edition link instead
                if (nextBook == null && seriesInfo.TryGetValue("NextURL", out var nextUrl))
                {
                    nextBook = await GetBookFromKindleEditionsPage(nextUrl, title, curBook.author);
                }
                if (nextBook == null)
                {
                    Logger.Log("Book was found to be part of a series, but an error occured finding the next book.\r\n" +
                               "Please report this book and the Shelfari URL and output log to improve parsing.");
                }
            }
            else if (curBook.seriesPosition != curBook.totalInSeries.ToString())
            {
                Logger.Log("An error occured finding the next book in series, the book may not be part of a series, or it is the latest release.");
            }

            // Only look the previous book up when it has not been populated yet. The old code hung the error
            // log on the wrong branch: it reported a failure whenever previousInSeries was already known and
            // stayed silent when the lookup actually failed (and the message talked about the "next" book).
            if (seriesInfo.TryGetValue("Previous", out title) && curBook.previousInSeries == null)
            {
                // Attempt to search Amazon for the book
                curBook.previousInSeries = await Amazon.SearchBook(title, curBook.author, TLD);
                if (curBook.previousInSeries != null)
                {
                    await curBook.previousInSeries.GetAmazonInfo(curBook.previousInSeries.amazonUrl); //fill in desc, imageurl, and ratings
                }
                // Try to fill in desc, imageurl, and ratings using Shelfari Kindle edition link instead
                else if (seriesInfo.TryGetValue("PreviousURL", out var previousUrl))
                {
                    curBook.previousInSeries = await GetBookFromKindleEditionsPage(previousUrl, title, curBook.author);
                }

                if (curBook.previousInSeries == null)
                {
                    Logger.Log("Book was found to be part of a series, but an error occurred finding the previous book.\r\n" +
                               "Please report this book and the Shelfari URL and output log to improve parsing.");
                }
            }

            return nextBook;
        }

        /// <summary>
        /// Downloads a Kindle editions page, takes the first quoted ASIN-shaped token found in its HTML and
        /// builds a BookInfo from it, filling in details from the matching amazon.com product page.
        /// </summary>
        /// <returns>The populated book, or null when the page contains no quoted ASIN.</returns>
        private static async Task<BookInfo> GetBookFromKindleEditionsPage(string editionsUrl, string title, string author)
        {
            HtmlDocument bookDoc = new HtmlDocument { OptionAutoCloseOnEnd = true };
            bookDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(editionsUrl));

            // The ASIN is expected wrapped in single quotes: 'B' followed by nine upper-case letters or digits
            Match match = Regex.Match(bookDoc.DocumentNode.InnerHtml, "('B[A-Z0-9]{9}')");
            if (!match.Success)
            {
                return null;
            }

            string cleanASIN = match.Value.Replace("'", String.Empty);
            BookInfo book = new BookInfo(title, author, cleanASIN);
            await book.GetAmazonInfo("http://www.amazon.com/dp/" + cleanASIN);
            return book;
        }

Beispiel #12

0

Datei anzeigen

Datei: Shelfari.cs Projekt: iyedg/xray-builder.gui

        /// <summary>
        /// Builds terms from the Characters, Organizations, Settings and Glossary wiki modules of a Shelfari page.
        /// Entries of the form "Name: description" are split at the first colon; duplicate names are skipped.
        /// </summary>
        /// <param name="dataUrl">Page URL; downloaded when no page is cached in sourceHtmlDoc, and used as the fallback term URL.</param>
        /// <param name="progress">Progress reporter; this implementation does not use it.</param>
        /// <param name="token">Cancellation token; this implementation does not observe it.</param>
        /// <returns>The terms found, in page order; empty when no module is present.</returns>
        public override async Task<List<XRay.Term>> GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
        {
            Logger.Log("Downloading Shelfari page...");
            List<XRay.Term> terms = new List<XRay.Term>();
            // Names already added, so each duplicate check is a hash lookup instead of a scan of the whole list
            HashSet<string> seenNames = new HashSet<string>();

            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
            }

            //Constants for wiki processing: module id -> term type
            Dictionary<string, string> sections = new Dictionary<string, string>
            {
                { "WikiModule_Characters", "character" },
                { "WikiModule_Organizations", "topic" },
                { "WikiModule_Settings", "topic" },
                { "WikiModule_Glossary", "topic" }
            };

            foreach (KeyValuePair<string, string> section in sections)
            {
                HtmlNodeCollection entryNodes =
                    sourceHtmlDoc.DocumentNode.SelectNodes("//div[@id='" + section.Key + "']//ul[@class='li_6']/li");
                if (entryNodes == null)
                {
                    continue; //Skip section if not found on page
                }

                foreach (HtmlNode li in entryNodes)
                {
                    string    entryText = li.InnerText;
                    XRay.Term newTerm   = new XRay.Term(section.Value); //Create term as either character/topic

                    int colonIndex = entryText.IndexOf(':');
                    if (colonIndex >= 0)
                    {
                        newTerm.TermName = entryText.Substring(0, colonIndex);
                        newTerm.Desc     = entryText.Substring(colonIndex + 1).Replace("&amp;", "&").Trim();
                    }
                    else
                    {
                        newTerm.TermName = entryText;
                    }
                    newTerm.DescSrc = "shelfari";

                    //Use either the associated shelfari URL of the term or if none exists, use the book's url.
                    //The href value starts at offset 9, right after <a href=" ; fall back to the book URL when
                    //the closing quote cannot be found instead of throwing on a negative length.
                    string entryHtml = li.InnerHtml;
                    int hrefEnd = entryHtml.Length > 9 && entryHtml.StartsWith("<a href", StringComparison.Ordinal)
                        ? entryHtml.IndexOf('"', 9)
                        : -1;
                    newTerm.DescUrl = hrefEnd >= 9
                        ? entryHtml.Substring(9, hrefEnd - 9)
                        : dataUrl;

                    //Default glossary terms to be case insensitive when searching through book
                    if (section.Key == "WikiModule_Glossary")
                    {
                        newTerm.MatchCase = false;
                    }

                    if (seenNames.Add(newTerm.TermName))
                    {
                        terms.Add(newTerm);
                    }
                    else
                    {
                        Logger.Log("Duplicate term \"" + newTerm.TermName + "\" found. Ignoring this duplicate.");
                    }
                }
            }

            return terms;
        }

Beispiel #13

0

Datei anzeigen

Datei: Shelfari.cs Projekt: iyedg/xray-builder.gui

        /// <summary>
        /// Search Shelfari page for possible series info, returning the neighbouring titles in the series
        /// without downloading any other pages. Modifies curBook (seriesPosition, totalInSeries, seriesName).
        /// </summary>
        /// <param name="curBook">Current book; its dataUrl is downloaded when no page is cached in sourceHtmlDoc.</param>
        /// <returns>
        /// null when curBook.dataUrl is empty; otherwise a dictionary that may contain the keys
        /// "Previous", "PreviousURL", "Next" and "NextURL" (empty when no series info is found).
        /// </returns>
        private async Task<Dictionary<string, string>> GetNextInSeriesTitle(BookInfo curBook)
        {
            if (curBook.dataUrl == "")
            {
                return null;
            }
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            Dictionary<string, string> results = new Dictionary<string, string>(2);

            // Series parsing is only attempted when the first-edition module (with its inner div) is present
            HtmlNode pageNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_FirstEdition']");
            if (pageNode?.SelectSingleNode(".//div/div") == null)
            {
                return results;
            }

            //Check if book series is available and displayed in Series & Lists on Shelfari page.
            HtmlNode seriesNode = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='WikiModule_Series']/div");
            // SelectNodes yields null (not an empty collection) when nothing matches, so guard before iterating
            HtmlNodeCollection seriesTypes = seriesNode?.SelectNodes(".//div");
            if (seriesTypes == null)
            {
                return results;
            }

            //If multiple Series found, find and use standard series.
            foreach (HtmlNode seriesType in seriesTypes)
            {
                string seriesText = seriesType.InnerText;
                if (!seriesText.Contains("(standard series)", StringComparison.OrdinalIgnoreCase)
                    || seriesText.Contains("(Reading Order)", StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                Match match = Regex.Match(seriesText, @"This is book (\d+) of (\d+) in (.+)\.");
                if (!match.Success || match.Groups.Count != 4)
                {
                    continue;
                }

                Logger.Log("About the series: " + seriesText.Replace(". (standard series)", ""));
                curBook.seriesPosition = match.Groups[1].Value;
                curBook.totalInSeries  = int.Parse(match.Groups[2].Value);
                curBook.seriesName     = match.Groups[3].Value;

                HtmlNode seriesInfo = seriesNode.SelectSingleNode(".//p");

                //Parse preceding book
                if (seriesInfo != null && seriesInfo.InnerText.Contains("Preceded by ", StringComparison.OrdinalIgnoreCase))
                {
                    // "Preceded by X, followed by Y." first, then the "Preceded by X." form
                    match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*),", RegexOptions.IgnoreCase);
                    if (!match.Success || match.Groups.Count != 2)
                    {
                        match = Regex.Match(seriesInfo.InnerText, @"Preceded by (.*)\.", RegexOptions.IgnoreCase);
                    }
                    if (match.Success && match.Groups.Count == 2)
                    {
                        results["Previous"] = match.Groups[1].Value;
                        // Log only when a title was actually extracted
                        Logger.Log("Preceded by: " + match.Groups[1].Value);
                    }
                    //Grab Shelfari Kindle edition link for this book
                    results["PreviousURL"] = seriesInfo.ChildNodes["a"].GetAttributeValue("href", "") + "/editions?binding=Kindle";
                }

                // Check if book is the last in the series. seriesPosition is a string and totalInSeries a number,
                // so the previous seriesPosition.Equals(totalInSeries) compared a string with a boxed number and
                // was always false; compare the textual forms instead.
                bool isLastInSeries = curBook.seriesPosition == curBook.totalInSeries.ToString();

                //Parse following book
                if (!isLastInSeries
                    && seriesInfo != null
                    && seriesInfo.InnerText.Contains("followed by ", StringComparison.OrdinalIgnoreCase))
                {
                    match = Regex.Match(seriesInfo.InnerText, @"followed by (.*)\.", RegexOptions.IgnoreCase);
                    if (match.Success && match.Groups.Count == 2)
                    {
                        Logger.Log("Followed by: " + match.Groups[1].Value);
                        //Grab Shelfari Kindle edition link for this book
                        // NOTE(review): this reads the same first <a> child as PreviousURL above - confirm that
                        // the "followed by" link is not a later anchor when both neighbours are listed.
                        results["NextURL"] = seriesInfo.ChildNodes["a"].GetAttributeValue("href", "") + "/editions?binding=Kindle";
                        results["Next"]    = match.Groups[1].Value;
                    }
                }

                // Only the first matching standard series is used
                break;
            }

            return results;
        }

Beispiel #14

0

Datei anzeigen

        /// <summary>
        /// Requests the "next in series" record for an ASIN from the revensoftware.com endpoint and
        /// deserializes the JSON response.
        /// </summary>
        /// <param name="asin">ASIN appended to the endpoint URL.</param>
        /// <returns>The deserialized <c>NextBookResult</c>.</returns>
        public static async Task<NextBookResult> DownloadNextInSeries(string asin)
        {
            string requestUrl = $"https://www.revensoftware.com/amazon/next/{asin}";
            string json = await HttpDownloader.GetPageHtmlAsync(requestUrl);
            NextBookResult parsed = JsonConvert.DeserializeObject<NextBookResult>(json);
            return parsed;
        }

Beispiel #15

0

Datei anzeigen

 /// <summary>
 /// Starts the download of the start-actions document for the given ASIN.
 /// </summary>
 /// <param name="asin">ASIN inserted verbatim as the last path segment of the request URL.</param>
 /// <returns>The pending download task; its result is the raw response text.</returns>
 public static Task <string> DownloadStartActions(string asin)
 {
     string endpoint = $"https://www.revensoftware.com/amazon/sa/{asin}";

     return(HttpDownloader.GetPageHtmlAsync(endpoint));
 }

Beispiel #16

0

Datei anzeigen

        /// <summary>
        /// Searches Goodreads for possible series info and locates the previous and next titles in the series.
        /// Modifies curBook: seriesName, seriesPosition and totalInSeries are filled in when they can be parsed,
        /// and previousInSeries / nextInSeries are set when the matching books are found.
        /// </summary>
        /// <param name="curBook">Book whose dataUrl is downloaded (when no page is cached yet) and parsed for series info.</param>
        /// <returns>
        /// Dictionary with up to two entries keyed "Previous" and "Next"; empty when no series info could be parsed.
        /// </returns>
        private async Task <Dictionary <string, BookInfo> > GetNextInSeriesTitle(BookInfo curBook)
        {
            Dictionary <string, BookInfo> results = new Dictionary <string, BookInfo>(2);

            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            //Search Goodreads for series info
            string   goodreadsSeriesUrl = @"https://www.goodreads.com/series/{0}";
            HtmlNode metaNode           = sourceHtmlDoc.DocumentNode.SelectSingleNode("//div[@id='bookMeta']");
            HtmlNode seriesNode         = metaNode?.SelectSingleNode("//h1[@id='bookTitle']/a");

            if (seriesNode == null)
            {
                return(results);
            }
            var match = Regex.Match(seriesNode.OuterHtml, @"/series/([0-9]*)");

            if (!match.Success)
            {
                return(results);
            }
            goodreadsSeriesUrl = String.Format(goodreadsSeriesUrl, match.Groups[1].Value);
            match = Regex.Match(seriesNode.InnerText, @"\((.*) #?([0-9]*([.,][0-9])?)\)");
            if (!match.Success)
            {
                return(results);
            }
            Logger.Log($"Series Goodreads Page URL: {goodreadsSeriesUrl}");
            curBook.seriesName     = match.Groups[1].Value.Trim();
            curBook.seriesPosition = match.Groups[2].Value;

            HtmlDocument seriesHtmlDoc = new HtmlDocument {
                OptionAutoCloseOnEnd = true
            };

            seriesHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(goodreadsSeriesUrl));

            seriesNode = seriesHtmlDoc.DocumentNode.SelectSingleNode("//div[contains(@class, 'responsiveSeriesHeader__subtitle')]");
            match      = Regex.Match(seriesNode?.InnerText ?? "", @"([0-9]+) (?:primary )?works?");
            if (match.Success)
            {
                curBook.totalInSeries = int.Parse(match.Groups[1].Value);
            }

            // The position pattern accepts both '.' and ',' as the fractional separator. Under the invariant
            // culture ',' is a group separator ("1,5" would parse as 15), so normalize it before parsing.
            string normalizedPosition = curBook.seriesPosition.Replace(',', '.');
            double positionValue;

            if (!double.TryParse(normalizedPosition, NumberStyles.Float, CultureInfo.InvariantCulture, out positionValue))
            {
                // The position pattern can also capture an empty string; there is nothing to search for then.
                Logger.Log($"Unable to determine the position of this book in the {curBook.seriesName} series.");
                return(results);
            }

            int positionInt = (int)positionValue;
            int totalInt    = (int)Convert.ToDouble(curBook.totalInSeries, CultureInfo.InvariantCulture.NumberFormat);

            Logger.Log($"This is book {curBook.seriesPosition} of {curBook.totalInSeries} in the {curBook.seriesName} series");

            HtmlNodeCollection bookNodes = seriesHtmlDoc.DocumentNode.SelectNodes("//div[@itemtype='http://schema.org/Book']");
            // For a fractional position (e.g. 1.5) the preceding entry is the truncated whole number itself.
            string prevSearch = normalizedPosition.Contains(".")
                ? $"book {positionInt}"
                : $"book {positionInt - 1}";
            string nextSearch = $"book {positionInt + 1}";

            if (bookNodes != null)
            {
                foreach (HtmlNode book in bookNodes)
                {
                    var bookIndex = book.SelectSingleNode(".//div[@class='responsiveBook__header']")?.InnerText.ToLowerInvariant();
                    if (bookIndex == null)
                    {
                        continue;
                    }
                    if (results.Count == 0 && bookIndex == prevSearch)
                    {
                        BookInfo prevBook = await ParseSeriesBookNode(book).ConfigureAwait(false);
                        if (prevBook != null)
                        {
                            results["Previous"]      = prevBook;
                            curBook.previousInSeries = prevBook;
                            Logger.Log($"Preceded by: {prevBook.title}");
                        }
                        continue;
                    }
                    if (bookIndex == nextSearch)
                    {
                        BookInfo nextBook = await ParseSeriesBookNode(book).ConfigureAwait(false);
                        if (nextBook != null)
                        {
                            results["Next"]      = nextBook;
                            curBook.nextInSeries = nextBook;
                            Logger.Log($"Followed by: {nextBook.title}");
                        }
                    }
                    if (results.Count == 2 || results.Count == 1 && positionInt == totalInt)
                    {
                        break; // next and prev found or prev found and latest in series
                    }
                }
            }
            return(results);
        }

        /// <summary>
        /// Builds a BookInfo (title, author and, when resolvable, ASIN) from one book entry of a series page.
        /// </summary>
        /// <param name="book">Node of a single book entry on the series page.</param>
        /// <returns>The parsed book, or null when the entry has no title link.</returns>
        private async Task <BookInfo> ParseSeriesBookNode(HtmlNode book)
        {
            HtmlNode titleNode = book.SelectSingleNode(".//div[@class='u-paddingBottomXSmall']/a");

            if (titleNode == null)
            {
                // Markup differs from what is expected; skip the entry instead of dereferencing null.
                return(null);
            }

            BookInfo info = new BookInfo("", "", "");

            // Drop a trailing parenthetical from the link text.
            info.title = Regex.Replace(titleNode.InnerText.Trim(), @" \(.*\)", "");

            Match idMatch = Regex.Match(titleNode.GetAttributeValue("href", ""), @"show/([0-9]+)");
            if (idMatch.Success)
            {
                info.asin = await SearchBookASIN(idMatch.Groups[1].Value, info.title).ConfigureAwait(false);
            }
            info.author = book.SelectSingleNode(".//span[@itemprop='author']//a")?.InnerText.Trim() ?? "";
            return(info);
        }

Beispiel #17

0

Datei anzeigen

        /// <summary>
        /// Searches for the next and previous books in a series, if it is part of one.
        /// Modifies curBook.previousInSeries to contain the found book info.
        /// </summary>
        /// <param name="curBook">Book to inspect; nothing is looked up when its dataUrl is null or empty.</param>
        /// <param name="authorProfile">Profile whose otherBooks list is searched first for a matching title.</param>
        /// <param name="TLD">Amazon top-level domain used to build product URLs; reset to "com" if the first Amazon lookup throws.</param>
        /// <returns>Next book in series, or null when none could be found</returns>
        public override async Task <BookInfo> GetNextInSeries(BookInfo curBook, AuthorProfile authorProfile, string TLD)
        {
            BookInfo nextBook = null;

            if (string.IsNullOrEmpty(curBook.dataUrl))
            {
                return(null);
            }
            if (sourceHtmlDoc == null)
            {
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(curBook.dataUrl));
            }

            // Get title of next book
            Dictionary <string, BookInfo> seriesInfo = await GetNextInSeriesTitle(curBook);

            if (seriesInfo.TryGetValue("Next", out var book))
            {
                // TODO: next and previous sections are the same...
                // Search author's other books for the book (assumes next in series was written by the same author...)
                // Returns the first one found, though there should probably not be more than 1 of the same name anyway
                // The title is escaped so regex metacharacters in it ("?", "+", "(" ...) are matched literally.
                string nextTitlePattern = $@"^{Regex.Escape(book.title)}(?: \(.*\))?$";
                nextBook = authorProfile.otherBooks.FirstOrDefault(bk => Regex.IsMatch(bk.title, nextTitlePattern));
                if (nextBook == null)
                {
                    // Attempt to search Amazon for the book instead
                    // TODO: This should be elsewhere
                    try
                    {
                        if (!string.IsNullOrEmpty(book.asin))
                        {
                            nextBook = book;
                            await nextBook.GetAmazonInfo($"https://www.amazon.{TLD}/dp/{book.asin}");
                        }
                        else
                        {
                            nextBook = await Amazon.SearchBook(book.title, book.author, TLD);
                        }

                        if (nextBook == null && settings.promptASIN)
                        {
                            Logger.Log($"ASIN prompt for {book.title}...");
                            nextBook            = new BookInfo(book.title, book.author, "");
                            frmAS.Text          = "Next in Series";
                            frmAS.lblTitle.Text = book.title;
                            frmAS.tbAsin.Text   = "";
                            frmAS.ShowDialog();
                            Logger.Log($"ASIN supplied: {frmAS.tbAsin.Text}");
                            string Url = $"https://www.amazon.{TLD}/dp/{frmAS.tbAsin.Text}";
                            await nextBook.GetAmazonInfo(Url);

                            nextBook.amazonUrl = Url;
                            nextBook.asin      = frmAS.tbAsin.Text;
                        }
                    }
                    catch
                    {
                        // Any failure above falls back to a title search on Amazon.com; TLD stays "com" afterwards.
                        Logger.Log($"Failed to find {book.title} on Amazon.{TLD}, trying again with Amazon.com.");
                        TLD      = "com";
                        nextBook = await Amazon.SearchBook(book.title, book.author, TLD);
                    }

                    if (nextBook != null)
                    {
                        await nextBook.GetAmazonInfo(nextBook.amazonUrl); //fill in desc, imageurl, and ratings
                    }
                }

                if (nextBook == null)
                {
                    Logger.Log("Book was found to be part of a series, but an error occurred finding the next book.\r\n"
                               + "Please report this book and the Goodreads URL and output log to improve parsing (if it's a real book).");
                }
            }
            else if (curBook.totalInSeries == 0)
            {
                Logger.Log("The book was not found to be part of a series.");
            }
            else if (curBook.seriesPosition != curBook.totalInSeries.ToString() && !curBook.seriesPosition?.Contains(".") == true)
            {
                Logger.Log("An error occurred finding the next book in series. The book may not be part of a series, or it is the latest release.");
            }

            if (seriesInfo.TryGetValue("Previous", out book))
            {
                string prevTitlePattern = $@"^{Regex.Escape(book.title)}(?: \(.*\))?$";
                var    prevBook         = authorProfile.otherBooks.FirstOrDefault(bk => Regex.IsMatch(bk.title, prevTitlePattern));

                // An empty ASIN means no ASIN was resolved; only prefer the parsed book when it really has one,
                // otherwise keep the match from the author's other books (same test as the "Next" branch).
                if (!string.IsNullOrEmpty(book.asin))
                {
                    prevBook = book;
                    await prevBook.GetAmazonInfo($"https://www.amazon.{TLD}/dp/{book.asin}");
                }
                else if (prevBook != null)
                {
                    await prevBook.GetAmazonInfo(prevBook.amazonUrl);
                }
                if (prevBook == null && settings.promptASIN)
                {
                    Logger.Log($"ASIN prompt for {book.title}...");
                    prevBook            = new BookInfo(book.title, book.author, "");
                    frmAS.Text          = "Previous in Series";
                    frmAS.lblTitle.Text = book.title;
                    frmAS.tbAsin.Text   = "";
                    frmAS.ShowDialog();
                    Logger.Log($"ASIN supplied: {frmAS.tbAsin.Text}");
                    string Url = $"https://www.amazon.{TLD}/dp/{frmAS.tbAsin.Text}";
                    await prevBook.GetAmazonInfo(Url);

                    prevBook.amazonUrl = Url;
                    prevBook.asin      = frmAS.tbAsin.Text;
                }
                if (prevBook == null)
                {
                    Logger.Log("Book was found to be part of a series, but an error occurred finding the previous book.\r\n" +
                               "Please report this book and the Goodreads URL and output log to improve parsing.");
                }
                else
                {
                    // Store the resolved book so the result of the lookup/prompt is not discarded.
                    curBook.previousInSeries = prevBook;
                }
            }
            return(nextBook);
        }

Beispiel #18

0

Datei anzeigen

        /// <summary>
        /// Searches Amazon for the author of <paramref name="curBook"/> and downloads the author's page.
        /// Sets curBook.authorAsin when the author page is found.
        /// </summary>
        /// <param name="curBook">Book whose author is searched for; its asin is used to name the optional HTML dump.</param>
        /// <param name="TLD">Amazon top-level domain to query (inserted after "amazon." in every URL).</param>
        /// <returns>
        /// Search results with authorAsin set and authorHtmlDoc holding the author's page,
        /// or null when no author page could be located or its URL could not be parsed.
        /// </returns>
        public static async Task <AuthorSearchResults> SearchAuthor(BookInfo curBook, string TLD)
        {
            AuthorSearchResults results = new AuthorSearchResults();
            //Generate Author search URL from author's name
            string newAuthor = Functions.FixAuthor(curBook.author);
            // Percent-encode the name so characters such as '&' or accented letters cannot corrupt the query string;
            // spaces are still sent as '+', exactly as before.
            string plusAuthorName = Uri.EscapeDataString(newAuthor).Replace("%20", "+");
            //Updated to match Search "all" Amazon
            string amazonAuthorSearchUrl = $"https://www.amazon.{TLD}/s/ref=nb_sb_noss_2?url=search-alias%3Dstripbooks&field-keywords={plusAuthorName}";

            Logger.Log($"Searching for author's page on Amazon.{TLD}...");

            // Search Amazon for Author
            results.authorHtmlDoc = new HtmlDocument {
                OptionAutoCloseOnEnd = true
            };
            results.authorHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(amazonAuthorSearchUrl));

            if (Properties.Settings.Default.saveHtml)
            {
                try
                {
                    Logger.Log("Saving Amazon's author search webpage...");
                    File.WriteAllText(Path.Combine(Environment.CurrentDirectory, "dmp", $"{curBook.asin}.authorsearchHtml.txt"),
                                      results.authorHtmlDoc.DocumentNode.InnerHtml);
                }
                catch (Exception ex)
                {
                    // Saving the dump is best-effort; a failure must not abort the search.
                    Logger.Log(String.Format("An error ocurred saving authorsearchHtml.txt: {0}", ex.Message));
                }
            }

            // Check for captcha
            // TODO: Try to prompt for captcha and have user complete it to continue
            if (results.authorHtmlDoc.DocumentNode.InnerText.Contains("Robot Check"))
            {
                Logger.Log($"Warning: Amazon.{TLD} is requesting a captcha. "
                           + $"You can try visiting Amazon.{TLD} in a real browser first, try another region, or try again later.");
            }
            // Try to find Author's page from Amazon search
            HtmlNode node = results.authorHtmlDoc.DocumentNode.SelectSingleNode("//*[@id='result_1']");

            if (node == null || !node.OuterHtml.Contains("/e/B"))
            {
                Logger.Log($"An error occurred finding author's page on Amazon.{TLD}." +
                           "\r\nUnable to create Author Profile." +
                           "\r\nEnsure the author metadata field matches the author's name exactly." +
                           $"\r\nSearch results can be viewed at {amazonAuthorSearchUrl}");
                return(null);
            }

            string properAuthor = "";

            // Check for typical search results, second item is the author page
            if ((node = node.SelectSingleNode("//*[@id='result_1']/div/div/div/div/a")) != null)
            {
                properAuthor       = node.GetAttributeValue("href", "");
                results.authorAsin = node.GetAttributeValue("data-asin", null)
                                     ?? AsinFromUrl(properAuthor);
            }
            // otherwise check for "by so-and-so" text beneath the titles for a possible match
            // (skipped when the name contains a double quote, which would make the XPath literal invalid)
            else if (!newAuthor.Contains("\"")
                     && (node = results.authorHtmlDoc.DocumentNode.SelectSingleNode($"//div[@id='resultsCol']//li[@class='s-result-item celwidget  ']//a[text()=\"{newAuthor}\"]")) != null)
            {
                properAuthor       = node.GetAttributeValue("href", "");
                results.authorAsin = AsinFromUrl(properAuthor);
            }

            // The href must look like "/<author-name>/..." with a non-trivial first segment, and an ASIN must be known.
            if (string.IsNullOrEmpty(properAuthor) || properAuthor.IndexOf('/', 1) < 3 || string.IsNullOrEmpty(results.authorAsin))
            {
                Logger.Log("Unable to parse author's page URL properly. Try again later or report this URL on the MobileRead thread: " + amazonAuthorSearchUrl);
                return(null);
            }
            // Keep only the first path segment (the text between the leading '/' and the next '/').
            properAuthor = properAuthor.Substring(1, properAuthor.IndexOf('/', 1) - 1);
            string authorAmazonWebsiteLocationLog = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin;
            string authorAmazonWebsiteLocation    = @"https://www.amazon." + TLD + "/" + properAuthor + "/e/" + results.authorAsin +
                                                    "/ref=la_" + results.authorAsin +
                                                    "_rf_p_n_feature_browse-b_2?fst=as%3Aoff&rh=n%3A283155%2Cp_82%3A" +
                                                    results.authorAsin +
                                                    "%2Cp_n_feature_browse-bin%3A618073011&bbn=283155&ie=UTF8&qid=1432378570&rnid=618072011";

            curBook.authorAsin = results.authorAsin;
            Logger.Log($"Author page found on Amazon!\r\nAuthor's Amazon Page URL: {authorAmazonWebsiteLocationLog}");

            // Load Author's Amazon page
            string authorpageHtml;

            try
            {
                authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocation);
            }
            catch
            {
                // If page not found (on co.uk at least, the long form does not seem to work) fallback to short form
                // and pray the formatting/item display suits our needs. If short form not found, crash back to caller.
                authorpageHtml = await HttpDownloader.GetPageHtmlAsync(authorAmazonWebsiteLocationLog);
            }
            results.authorHtmlDoc.LoadHtml(authorpageHtml);
            return(results);
        }

C# (CSharp) HttpDownloader.GetPageHtmlAsync Beispiele