Esempio n. 1
0
        /// <summary>
        /// Gather the list of quotes and the number of times they've been liked -- close enough to
        /// "x paragraphs have been highlighted y times" from Amazon.
        /// </summary>
        /// <param name="url">Book page URL; only downloaded when <paramref name="srcDoc"/> is null.</param>
        /// <param name="srcDoc">Already-downloaded book page, if available.</param>
        /// <param name="progress">Optional progress reporter, advanced once per quote page.</param>
        /// <param name="cancellationToken">Token used to cancel the page downloads.</param>
        /// <returns>
        /// All quotes that could be parsed from every quote page, or null when the book page has no
        /// "Quotes from" link.
        /// </returns>
        public async Task <IEnumerable <NotableClip> > GetNotableClipsAsync(string url, HtmlDocument srcDoc = null, IProgressBar progress = null, CancellationToken cancellationToken = default)
        {
            if (srcDoc == null)
            {
                srcDoc = await _httpClient.GetPageAsync(url, cancellationToken);
            }
            var quoteNode = srcDoc.DocumentNode.SelectSingleNode("//div[@class='h2Container gradientHeaderContainer']/h2/a[starts-with(.,'Quotes from')]");

            if (quoteNode == null)
            {
                return(null);
            }
            // {0} is left as a placeholder for the page number
            var quoteURL = $"https://www.goodreads.com{quoteNode.GetAttributeValue("href", "")}?page={{0}}";

            progress?.Set(0, 1);

            var quoteBag    = new ConcurrentBag <IEnumerable <NotableClip> >();
            var initialPage = await _httpClient.GetPageAsync(string.Format(quoteURL, 1), cancellationToken);

            // check how many pages there are (find previous page button, get parent div, take all children of that, 2nd last one should be the max page count
            var maxPageNode = initialPage.DocumentNode.SelectSingleNode("//span[contains(@class,'previous_page')]/parent::div/*[last()-1]");

            // A missing, unparsable or non-positive page count means a single page;
            // anything below 1 would make Enumerable.Range throw further down.
            if (!int.TryParse(maxPageNode?.InnerHtml, out var maxPages) || maxPages < 1)
            {
                maxPages = 1;
            }

            // Parses one quote page eagerly so each page is only scanned once.
            IEnumerable <NotableClip> ParseQuotePage(HtmlDocument quoteDoc)
            {
                var parsedQuotes = new List <NotableClip>();
                var quoteNodes   = quoteDoc.DocumentNode.SelectNodes("//div[@class='quotes']/div[@class='quote']");

                if (quoteNodes == null)
                {
                    return(parsedQuotes);
                }

                foreach (var node in quoteNodes)
                {
                    var quoteMatch = Regex.Match(node.InnerText, "&ldquo;(.*?)&rdquo;", RegexOptions.Compiled);
                    var likesMatch = Regex.Match(node.SelectSingleNode(".//div[@class='right']/a")?.InnerText ?? "",
                                                 @"(\d+) likes", RegexOptions.Compiled);

                    // Skip entries without both a quote and a usable like count
                    // (TryParse also guards against counts that do not fit in an int).
                    if (!quoteMatch.Success
                        || !likesMatch.Success
                        || !int.TryParse(likesMatch.Groups[1].Value, out var likes))
                    {
                        continue;
                    }

                    parsedQuotes.Add(new NotableClip
                    {
                        Text = quoteMatch.Groups[1].Value,
                        Likes = likes
                    });
                }

                return(parsedQuotes);
            }

            quoteBag.Add(ParseQuotePage(initialPage));
            progress?.Set(1, maxPages);

            // Page 1 is already handled; fetch the remaining pages concurrently.
            await Enumerable.Range(2, maxPages - 1).ParallelForEachAsync(async page =>
            {
                var quotePage = await _httpClient.GetPageAsync(string.Format(quoteURL, page), cancellationToken);
                quoteBag.Add(ParseQuotePage(quotePage));
                progress?.Add(1);
            }, MaxConcurrentRequests, cancellationToken);

            return(quoteBag.SelectMany(quotes => quotes).ToList());
        }
Esempio n. 2
0
        /// <summary>
        /// Scans every paragraph of the KFX book, recording the occurrences of each matchable term,
        /// linking excerpts to the terms found in them and matching any downloaded notable clips.
        /// Also sets <c>xray.Srl</c>/<c>xray.Erl</c> to span the whole content.
        /// </summary>
        /// <param name="xray">X-Ray whose terms, excerpts and content range are updated in place.</param>
        /// <param name="kfx">Book container the paragraphs are read from.</param>
        /// <param name="skipNoLikes">Forwarded to the notable clip matching.</param>
        /// <param name="minClipLen">Forwarded to the notable clip matching.</param>
        /// <param name="progress">Optional progress reporter, advanced once per paragraph.</param>
        /// <param name="token">Checked before each paragraph; cancels the scan.</param>
        /// <exception cref="System.InvalidOperationException">The book contains no paragraphs.</exception>
        public void AddLocations(XRay xray,
                                 KfxContainer kfx,
                                 bool skipNoLikes,
                                 int minClipLen,
                                 IProgressBar progress,
                                 CancellationToken token)
        {
            _logger.Log("Scanning book content...");

            var paragraphs = _paragraphsService.GetParagraphs(kfx).ToArray();

            // Fail with a meaningful message instead of the generic "Sequence contains no elements"
            // (same exception type that Last() would have raised).
            if (paragraphs.Length == 0)
            {
                throw new System.InvalidOperationException("Could not locate any paragraphs in this book.");
            }

            // Set start and end of content
            // TODO Figure out how to identify the first *actual* bit of content after the TOC
            var last = paragraphs[paragraphs.Length - 1];

            xray.Srl = 1;
            xray.Erl = last.Location + last.Length - 1;

            // The set of matchable terms does not change during the scan, so filter it only once
            var matchableTerms = xray.Terms.Where(term => term.Match).ToArray();

            progress?.Set(0, paragraphs.Length);
            foreach (var paragraph in paragraphs)
            {
                token.ThrowIfCancellationRequested();

                foreach (var character in matchableTerms)
                {
                    // Materialize once so the search is not repeated by the emptiness check and the union
                    var occurrences = _termsService.FindOccurrences(kfx, character, paragraph).ToArray();
                    if (occurrences.Length == 0)
                    {
                        continue;
                    }

                    character.Occurrences.UnionWith(occurrences);

                    ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
                }

                // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                if (xray.NotableClips != null)
                {
                    ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
                }

                progress?.Add(1);
            }

            // Report the matchable terms that were never found anywhere in the book
            var missingOccurrences = matchableTerms
                                     .Where(term => term.Occurrences.Count == 0)
                                     .Select(term => term.TermName)
                                     .ToArray();

            if (missingOccurrences.Length == 0)
            {
                return;
            }

            var termList = string.Join(", ", missingOccurrences);

            _logger.Log($"\r\nNo locations were found for the following terms. You should add aliases for them using the book as a reference:\r\n{termList}\r\n");
        }
Esempio n. 3
0
        /// <summary>
        /// Collects the characters listed on the book's Goodreads page (including those behind the
        /// "...more" toggle) and downloads the term information for each of them concurrently.
        /// </summary>
        /// <param name="dataUrl">Goodreads book page URL; only downloaded when no page has been cached yet.</param>
        /// <param name="progress">Optional progress reporter, advanced once per character processed.</param>
        /// <param name="token">Token used to cancel the concurrent downloads.</param>
        /// <returns>The terms that could be retrieved; empty when the page lists no characters.</returns>
        public override async Task <List <XRay.Term> > GetTerms(string dataUrl, IProgressBar progress, CancellationToken token)
        {
            if (sourceHtmlDoc == null)
            {
                Logger.Log("Downloading Goodreads page...");
                sourceHtmlDoc = new HtmlDocument();
                sourceHtmlDoc.LoadHtml(await HttpDownloader.GetPageHtmlAsync(dataUrl));
            }

            var charNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/a");

            if (charNodes == null)
            {
                return(new List <XRay.Term>());
            }
            // Check if ...more link exists on Goodreads page
            var moreCharNodes = sourceHtmlDoc.DocumentNode.SelectNodes("//div[@class='infoBoxRowTitle' and text()='Characters']/../div[@class='infoBoxRowItem']/span[@class='toggleContent']/a");
            var allChars      = moreCharNodes == null ? charNodes : charNodes.Concat(moreCharNodes);
            var termCount     = moreCharNodes == null ? charNodes.Count : charNodes.Count + moreCharNodes.Count;

            Logger.Log($"Gathering term information from Goodreads... ({termCount})");
            progress?.Set(0, termCount);
            if (termCount > 20)
            {
                Logger.Log("More than 20 characters found. Consider using the 'download to XML' option if you need to build repeatedly.");
            }
            var terms = new ConcurrentBag <XRay.Term>();
            await allChars.ParallelForEachAsync(async charNode =>
            {
                var charHref = charNode.GetAttributeValue("href", "");
                try
                {
                    terms.AddNotNull(await GetTerm(dataUrl, charHref).ConfigureAwait(false));
                }
                // A single failing character must not abort the whole build, but the failure has to be
                // visible in the log. Cancellation is deliberately not caught so it can propagate.
                catch (Exception ex) when (!(ex is OperationCanceledException))
                {
                    Logger.Log("Error getting page for character. URL: " + "https://www.goodreads.com" + charHref
                               + "\r\nMessage: " + ex.Message + "\r\n" + ex.StackTrace);
                }
                finally
                {
                    // Count the character as processed whether or not it succeeded so progress can complete
                    progress?.Add(1);
                }
            }, MaxConcurrentRequests, token);

            return(terms.ToList());
        }
Esempio n. 4
0
        /// <summary>
        /// Generate the necessities for the old format
        /// TODO Remove anything that gets generated for the new version
        /// </summary>
        /// <param name="curBook">Book to look up by ASIN; updated in place with the parsed Amazon info and, when editing is enabled, the edited description.</param>
        /// <param name="settings">Supplies the Amazon TLD, whether the description is edited and whether related books are enhanced.</param>
        /// <param name="progress">Optional progress reporter, advanced once per enhanced related book.</param>
        /// <param name="cancellationToken">Token used to cancel downloads and file writes.</param>
        /// <returns>
        /// The book together with its related ("customers also bought") books, or null when the Amazon
        /// page could not be downloaded or parsed (the reason is logged).
        /// </returns>
        public async Task <Response> GenerateOld(BookInfo curBook, Settings settings, IProgressBar progress = null, CancellationToken cancellationToken = default)
        {
            _logger.Log($"Attempting to find book on Amazon.{settings.AmazonTld}…");
            //Generate Book search URL from book's ASIN
            var ebookLocation = $@"https://www.amazon.{settings.AmazonTld}/dp/{curBook.Asin}";

            HtmlDocument bookHtmlDoc;

            try
            {
                bookHtmlDoc = await _httpClient.GetPageAsync(ebookLocation, cancellationToken);
            }
            catch (Exception ex)
            {
                _logger.Log($"An error occurred while downloading book's Amazon page: {ex.Message}\r\nYour ASIN may not be correct.");
                return(null);
            }
            _logger.Log("Book found on Amazon!");

            try
            {
                var response = _amazonInfoParser.ParseAmazonDocument(bookHtmlDoc);
                response.ApplyToBookInfo(curBook);
            }
            catch (Exception ex)
            {
                _logger.Log($"An error occurred parsing Amazon info: {ex.Message}");
                return(null);
            }

            // Reads the (possibly user-edited) description file; returns null when it cannot be read.
            string ReadDesc(string file)
            {
                try
                {
                    var fileText = Functions.ReadFromFile(file);

                    if (string.IsNullOrEmpty(fileText))
                    {
                        _logger.Log($"Found description file, but it is empty!\r\n{file}");
                    }
                    if (!string.Equals(curBook.Description, fileText))
                    {
                        // This is the description file, not the author biography
                        _logger.Log($"Using description from {file}.");
                    }

                    return(fileText);
                }
                catch (Exception ex)
                {
                    _logger.Log($"An error occurred while opening {file}\r\n{ex.Message}\r\n{ex.StackTrace}");
                }

                return(null);
            }

            var descFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory ?? Environment.CurrentDirectory, "ext", $"{curBook.Asin}.desc");

            if (settings.EditDescription)
            {
                if (!File.Exists(descFile) || new FileInfo(descFile).Length == 0)
                {
                    // Make sure the "ext" folder exists, otherwise the write below would throw
                    Directory.CreateDirectory(Path.GetDirectoryName(descFile));
#if NETFRAMEWORK
                    File.WriteAllText(descFile, curBook.Description);
#else
                    await File.WriteAllTextAsync(descFile, curBook.Description, cancellationToken);
#endif
                }
                _logger.Log("Displaying book description for editing...");
                Functions.RunNotepad(descFile);
                curBook.Description = ReadDesc(descFile);
            }

            try
            {
                // Known locations of the "related books" carousels/lists on the product page
                var listSelectors = new[]
                {
                    "//ol[@class='a-carousel' and @role='list']/li[@class='a-carousel-card']",
                    "//ol[@class='a-carousel' and @role='list']/li[@class='a-carousel-card a-float-left']",
                    "//ol[@class='a-carousel' and @role='list']/li[@class='a-carousel-card aok-float-left']",
                    "//*[contains(@id, 'desktop-dp-sims_purchase-similarities-esp')]/li",
                    "//*[contains(@id, 'dp-sims_OnlineDpSimsPurchaseStrategy-sims')]/li",
                    "//*[@id='desktop-dp-sims_purchase-similarities-sims-feature']/li",
                    "//*[@id='desktop-dp-sims_vtp-60-sims-feature']/li",
                    "//div[@id='desktop-dp-sims_session-similarities-brand-protection-sims-feature']/li",
                    "//div[@id='desktop-dp-sims_session-similarities-sims-feature']/li",
                    "//*[@id='view_to_purchase-sims-feature']/li"
                };

                var relatedBooks = listSelectors.SelectMany(selector =>
                {
                    var listNodes = bookHtmlDoc.DocumentNode.SelectNodes(selector);
                    return(listNodes != null
                            ? ParseBookList(listNodes)
                            : Enumerable.Empty <BookInfo>());
                }).Where(book => book != null)
                                   .Distinct()
                                   // Drop other editions of the current book and entries whose own title
                                   // is not a real book title. The regex is checked against the candidate's
                                   // title; checking the current book's title would be loop-invariant.
                                   .Where(book => book.Asin != curBook.Asin
                                                  && book.Title.IndexOf(curBook.Title, StringComparison.OrdinalIgnoreCase) < 0
                                                  && !_invalidBookTitleRegex.IsMatch(book.Title))
                                   .ToArray();

                if (settings.UseNewVersion && relatedBooks.Length > 0)
                {
                    _logger.Log($"Gathering metadata for {relatedBooks.Length} related book(s)…");
                    progress?.Set(0, relatedBooks.Length);
                    await foreach (var _ in _amazonClient.EnhanceBookInfos(relatedBooks, cancellationToken))
                    {
                        progress?.Add(1);
                    }
                }

                return(new Response
                {
                    Book = curBook,
                    CustomerAlsoBought = relatedBooks
                });
            }
            catch (Exception ex)
            {
                _logger.Log($"An error occurred parsing the book's amazon page: {ex.Message}{ex.StackTrace}");
                return(null);
            }
        }
        // TODO split this up, possible return a result instead of modifying xray
        /// <summary>
        /// Scans the rawML content paragraph by paragraph and fills <paramref name="xray"/> with the
        /// locations/occurrences of every matchable term, the excerpts those terms appear in and,
        /// for the new format, any downloaded notable clips found in the text.
        /// </summary>
        /// <param name="xray">X-Ray that is modified in place (terms, excerpts, Srl/Erl, FoundNotables).</param>
        /// <param name="metadata">Only <c>IsAzw3</c> is read here, to choose the location offset.</param>
        /// <param name="rawMlStream">Book content; parsed as HTML and, for the old format, read a second time as UTF-8.</param>
        /// <param name="enableEdit">Forwarded to the chapter handling (old format only).</param>
        /// <param name="useNewVersion">When false, chapters are handled first; when true, notable clips are matched.</param>
        /// <param name="skipNoLikes">Skip creating excerpts for notable clips that have zero likes.</param>
        /// <param name="minClipLen">Skip creating excerpts for notable clips shorter than this many characters.</param>
        /// <param name="overwriteChapters">Forwarded to the chapter handling (old format only).</param>
        /// <param name="safeShow">Forwarded to the chapter handling (old format only).</param>
        /// <param name="progress">Optional progress reporter, advanced once per paragraph that is fully processed.</param>
        /// <param name="token">Checked before each paragraph; cancels the scan.</param>
        /// <param name="ignoreSoftHypen">Also search a copy of the paragraph text with soft hyphens removed.</param>
        /// <param name="shortEx">Try to shorten excerpts whose highlight lies beyond the length limit.</param>
        public void ExpandFromRawMl(
            XRay xray,
            IMetadata metadata,
            Stream rawMlStream,
            bool enableEdit,
            bool useNewVersion,
            bool skipNoLikes,
            int minClipLen,
            bool overwriteChapters,
            SafeShowDelegate safeShow,
            IProgressBar progress,
            CancellationToken token,
            bool ignoreSoftHypen = false,
            bool shortEx         = true)
        {
            // Recorded locations are shifted by -16 for AZW3 books
            var locOffset = metadata.IsAzw3 ? -16 : 0;

            // If there is an apostrophe, attempt to match 's at the end of the term
            // Match end of word, then search for any lingering punctuation
            // Each pattern fragment is encoded as UTF-8 and decoded with _encoding, the same
            // encoding used below to load the document.
            var apostrophes      = _encoding.GetString(Encoding.UTF8.GetBytes("('|\u2019|\u0060|\u00B4)"));                                                                     // '\u2019\u0060\u00B4
            var quotes           = _encoding.GetString(Encoding.UTF8.GetBytes("(\"|\u2018|\u2019|\u201A|\u201B|\u201C|\u201D|\u201E|\u201F)"));
            var dashesEllipsis   = _encoding.GetString(Encoding.UTF8.GetBytes("(-|\u2010|\u2011|\u2012|\u2013|\u2014|\u2015|\u2026|&#8211;|&#8212;|&#8217;|&#8218;|&#8230;)")); //U+2010 to U+2015 and U+2026
            var punctuationMarks = string.Format(@"({0}s|{0})?{1}?[!\.?,""\);:]*{0}*{1}*{2}*", apostrophes, quotes, dashesEllipsis);

            // Ids for excerpts created by this call start from 0
            var excerptId = 0;
            var web       = new HtmlDocument();

            web.Load(rawMlStream, _encoding);

            // Only load chapters when building the old format
            if (!useNewVersion)
            {
                rawMlStream.Seek(0, SeekOrigin.Begin);
                // TODO: passing stream, doc, and contents probably not necessary)
                // NOTE(review): a StreamReader created this way normally closes the underlying stream
                // when disposed at the end of this block -- confirm callers do not reuse rawMlStream.
                using var streamReader = new StreamReader(rawMlStream, Encoding.UTF8);
                var readContents = streamReader.ReadToEnd();
                var utf8Doc      = new HtmlDocument();
                utf8Doc.LoadHtml(readContents);
                _chaptersService.HandleChapters(xray, xray.Asin, rawMlStream.Length, utf8Doc, readContents, overwriteChapters, safeShow, xray.Unattended, enableEdit);
            }
            else
            {
                // set default ERL to prevent filtering
                xray.Srl = 1;
                xray.Erl = rawMlStream.Length;
            }

            _logger.Log("Scanning book content...");
            var timer = new System.Diagnostics.Stopwatch();

            timer.Start();
            //Iterate over all paragraphs in book
            // Paragraph candidates, in order of preference: <p>, div.paragraph, div.p-indent
            var nodes = web.DocumentNode.SelectNodes("//p")
                        ?? web.DocumentNode.SelectNodes("//div[@class='paragraph']")
                        ?? web.DocumentNode.SelectNodes("//div[@class='p-indent']");

            if (nodes == null)
            {
                // Last resort: treat every div as a paragraph
                nodes = web.DocumentNode.SelectNodes("//div");
                _logger.Log("Warning: Could not locate paragraphs normally (p elements or divs of class 'paragraph').\r\n" +
                            "Searching all book contents (all divs), which may produce odd results.");
            }
            if (nodes == null)
            {
                throw new Exception("Could not locate any paragraphs in this book.\r\n" +
                                    "Report this error along with a copy of the book to improve parsing.");
            }
            progress?.Set(0, nodes.Count);
            for (var i = 0; i < nodes.Count; i++)
            {
                token.ThrowIfCancellationRequested();
                var node = nodes[i];
                if (node.FirstChild == null)
                {
                    continue;                          //If the inner HTML is just empty, skip the paragraph!
                }
                // Excerpt length is the length of the paragraph's HTML; its location is the stream
                // position of the paragraph's first child node.
                var lenQuote = node.InnerHtml.Length;
                var location = node.FirstChild.StreamPosition;
                if (location < 0)
                {
                    throw new Exception($"Unable to locate paragraph {i} within the book content.");
                }

                //Skip paragraph if outside chapter range
                // (paragraphs skipped by the continue statements above and here do not advance the progress bar)
                if (location < xray.Srl || location > xray.Erl)
                {
                    continue;
                }
                // Copy of the paragraph text with soft hyphens removed; stays empty unless ignoreSoftHypen is set
                var noSoftHypen = "";
                if (ignoreSoftHypen)
                {
                    noSoftHypen = node.InnerText;
                    noSoftHypen = noSoftHypen.Replace("\u00C2\u00AD", "");
                    noSoftHypen = noSoftHypen.Replace("&shy;", "");
                    noSoftHypen = noSoftHypen.Replace("&#xad;", "");
                    noSoftHypen = noSoftHypen.Replace("&#173;", "");
                    noSoftHypen = noSoftHypen.Replace("&#0173;", "");
                }
                foreach (var character in xray.Terms)
                {
                    //Search for character name and aliases in the html-less text. If failed, try in the HTML for rare situations.
                    //TODO: Improve location searching as IndexOf will not work if book length exceeds 2,147,483,647...
                    //If soft hyphen ignoring is turned on, also search hyphen-less text.
                    if (!character.Match)
                    {
                        continue;
                    }
                    var termFound = false;
                    // Convert from UTF8 string to default-encoded representation
                    var search = character.Aliases.Select(alias => _encoding.GetString(Encoding.UTF8.GetBytes(alias)))
                                 .ToList();
                    if (character.RegexAliases)
                    {
                        // Aliases are regular expressions: test each one against the text, the HTML and
                        // (optionally) the hyphen-less text. The term name itself is not searched in this mode.
                        if (search.Any(r => Regex.Match(node.InnerText, r).Success) ||
                            search.Any(r => Regex.Match(node.InnerHtml, r).Success) ||
                            (ignoreSoftHypen && search.Any(r => Regex.Match(noSoftHypen, r).Success)))
                        {
                            termFound = true;
                        }
                    }
                    else
                    {
                        // Search for character name and aliases
                        // If there is an apostrophe, attempt to match 's at the end of the term
                        // Match end of word, then search for any lingering punctuation
                        search.Add(character.TermName);
                        // Search list should be in descending order by length, even the term name itself
                        search = search.OrderByDescending(s => s.Length).ToList();
                        // NOTE(review): because && binds tighter than ||, the last clause
                        // (!MatchCase && hyphen-less search) is evaluated even when ignoreSoftHypen is
                        // false (noSoftHypen is then ""). Looks unintended -- confirm the grouping.
                        if ((character.MatchCase && (search.Any(node.InnerText.Contains) || search.Any(node.InnerHtml.Contains))) ||
                            (!character.MatchCase && (search.Any(node.InnerText.ContainsIgnorecase) || search.Any(node.InnerHtml.ContainsIgnorecase))) ||
                            (ignoreSoftHypen && (character.MatchCase && search.Any(noSoftHypen.Contains)) ||
                             (!character.MatchCase && search.Any(noSoftHypen.ContainsIgnorecase))))
                        {
                            termFound = true;
                        }
                    }

                    if (!termFound)
                    {
                        continue;
                    }

                    // Parallel lists: locHighlight[k] is the offset of a match inside the paragraph HTML,
                    // lenHighlight[k] its length.
                    var locHighlight = new List <int>();
                    var lenHighlight = new List <int>();
                    //Search html for character name and aliases
                    foreach (var s in search)
                    {
                        // The search string is inserted into the pattern as-is; case is only ignored for
                        // non-regex aliases without MatchCase.
                        var matches = Regex.Matches(node.InnerHtml, $@"{quotes}?\b{s}{punctuationMarks}", character.MatchCase || character.RegexAliases ? RegexOptions.None : RegexOptions.IgnoreCase);
                        foreach (Match match in matches)
                        {
                            // NOTE(review): index and length are looked up independently, so a new match
                            // is also skipped when its index and its length each appear in *different*
                            // earlier matches -- confirm whether a pairwise check was intended.
                            if (locHighlight.Contains(match.Index) && lenHighlight.Contains(match.Length))
                            {
                                continue;
                            }
                            locHighlight.Add(match.Index);
                            lenHighlight.Add(match.Length);
                        }
                    }
                    //If normal search fails, use regexp to search in case there is some wacky html nested in term
                    //Regexp may be less than ideal for parsing HTML but seems to work ok so far in these small paragraphs
                    //Also search in soft hyphen-less text if option is set to do so
                    if (locHighlight.Count == 0)
                    {
                        foreach (var s in search)
                        {
                            var          patterns    = new List <string>();
                            const string patternHtml = "(?:<[^>]*>)*";
                            //Match HTML tags -- provided there's nothing malformed
                            const string patternSoftHypen = "(\u00C2\u00AD|&shy;|&#173;|&#xad;|&#0173;|&#x00AD;)*";
                            // Allow optional tags and soft hyphens between every character of the search
                            // string (non-regex strings are regex-unescaped before being split).
                            var          pattern          = string.Format("{0}{1}{0}{2}",
                                                                          patternHtml,
                                                                          string.Join(patternHtml + patternSoftHypen, character.RegexAliases ? s.ToCharArray() : Regex.Unescape(s).ToCharArray()),
                                                                          punctuationMarks);
                            patterns.Add(pattern);
                            foreach (var pat in patterns)
                            {
                                MatchCollection matches;
                                if (character.MatchCase || character.RegexAliases)
                                {
                                    matches = Regex.Matches(node.InnerHtml, pat);
                                }
                                else
                                {
                                    matches = Regex.Matches(node.InnerHtml, pat, RegexOptions.IgnoreCase);
                                }
                                foreach (Match match in matches)
                                {
                                    // Same independent index/length duplicate check as in the first pass
                                    if (locHighlight.Contains(match.Index) && lenHighlight.Contains(match.Length))
                                    {
                                        continue;
                                    }
                                    locHighlight.Add(match.Index);
                                    lenHighlight.Add(match.Length);
                                }
                            }
                        }
                    }
                    if (locHighlight.Count == 0 || locHighlight.Count != lenHighlight.Count) //something went wrong
                    {
                        // _logger.Log($"An error occurred while searching for start of highlight.\r\nWas looking for (or one of the aliases of): {character.TermName}\r\nSearching in: {node.InnerHtml}");
                        continue;
                    }

                    //If an excerpt is too long, the X-Ray reader cuts it off.
                    //If the location of the highlighted word (character name) within the excerpt is far enough in to get cut off,
                    //this section attempts to shorten the excerpt by locating the start of a sentence that is just far enough away from the highlight.
                    //The length is determined by the space the excerpt takes up rather than its actual length... so 135 is just a guess based on what I've seen.
                    const int lengthLimit = 135;
                    for (var j = 0; j < locHighlight.Count; j++)
                    {
                        if (!shortEx || locHighlight[j] + lenHighlight[j] <= lengthLimit)
                        {
                            continue;
                        }
                        var  start           = locHighlight[j];
                        long newLoc          = -1;
                        var  newLenQuote     = 0;
                        var  newLocHighlight = 0;

                        // Walk backwards over sentence terminators before the highlight. Each terminator
                        // that still keeps the highlight within the limit overwrites the candidate, so the
                        // earliest acceptable sentence start wins; stop at the first one that is too far away.
                        while (start > -1)
                        {
                            var at = node.InnerHtml.LastIndexOfAny(new[] { '.', '?', '!' }, start);
                            if (at > -1)
                            {
                                start = at - 1;
                                if (locHighlight[j] + lenHighlight[j] + 1 - at - 2 <= lengthLimit)
                                {
                                    // at + 2 skips the terminator and the character that follows it
                                    newLoc          = location + at + 2;
                                    newLenQuote     = lenQuote - at - 2;
                                    newLocHighlight = locHighlight[j] - at - 2;
                                }
                                else
                                {
                                    break;
                                }
                            }
                            else
                            {
                                break;
                            }
                        }
                        //Only add new locs if shorter excerpt was found
                        if (newLoc >= 0)
                        {
                            character.Locs.Add(new []
                            {
                                newLoc + locOffset,
                                newLenQuote,
                                newLocHighlight,
                                lenHighlight[j]
                            });
                            // Remove the handled highlight from both lists and step j back so the element
                            // that slides into this slot is not skipped. Highlights removed here are not
                            // added to character.Occurrences by the loop below.
                            locHighlight.RemoveAt(j);
                            lenHighlight.RemoveAt(j--);
                        }
                    }

                    // Remaining highlights use the whole paragraph as their excerpt
                    for (var j = 0; j < locHighlight.Count; j++)
                    {
                        // For old format
                        character.Locs.Add(new long[]
                        {
                            location + locOffset,
                            lenQuote,
                            locHighlight[j],
                            lenHighlight[j]
                        });
                        // For new format
                        character.Occurrences.Add(new[] { location + locOffset + locHighlight[j], lenHighlight[j] });
                    }
                    // Reuse the excerpt already registered for this paragraph, if any; otherwise create one
                    var exCheck = xray.Excerpts.Where(t => t.Start.Equals(location + locOffset)).ToArray();
                    if (exCheck.Length > 0)
                    {
                        if (!exCheck[0].RelatedEntities.Contains(character.Id))
                        {
                            exCheck[0].RelatedEntities.Add(character.Id);
                        }
                    }
                    else
                    {
                        var newExcerpt = new Excerpt
                        {
                            Id     = excerptId++,
                            Start  = location + locOffset,
                            Length = lenQuote
                        };
                        newExcerpt.RelatedEntities.Add(character.Id);
                        xray.Excerpts.Add(newExcerpt);
                    }
                }

                // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                if (useNewVersion && xray.NotableClips != null)
                {
                    foreach (var quote in xray.NotableClips)
                    {
                        var index = node.InnerText.IndexOf(quote.Text, StringComparison.Ordinal);
                        if (index > -1)
                        {
                            // See if an excerpt already exists at this location
                            // NOTE(review): `index` is the quote's offset inside this paragraph's text,
                            // whereas the excerpts created in this method store the paragraph's stream
                            // position in Start -- this lookup probably does not compare like with like; confirm.
                            var excerpt = xray.Excerpts.FirstOrDefault(e => e.Start == index);
                            if (excerpt == null)
                            {
                                if (skipNoLikes && quote.Likes == 0 ||
                                    quote.Text.Length < minClipLen)
                                {
                                    continue;
                                }
                                // NOTE(review): unlike the term excerpts above, Start is stored here
                                // without locOffset -- confirm whether that difference is intended.
                                excerpt = new Excerpt
                                {
                                    Id         = excerptId++,
                                    Start      = location,
                                    Length     = node.InnerHtml.Length,
                                    Notable    = true,
                                    Highlights = quote.Likes
                                };
                                excerpt.RelatedEntities.Add(0); // Mark the excerpt as notable
                                // TODO: also add other related entities
                                xray.Excerpts.Add(excerpt);
                            }
                            else
                            {
                                excerpt.RelatedEntities.Add(0);
                            }

                            xray.FoundNotables++;
                        }
                    }
                }
                progress?.Add(1);
            }

            timer.Stop();
            _logger.Log($"Scan time: {timer.Elapsed}");
            //output list of terms with no locs
            foreach (var t in xray.Terms.Where(t => t.Match && t.Locs.Count == 0))
            {
                _logger.Log($"No locations were found for the term \"{t.TermName}\".\r\nYou should add aliases for this term using the book or rawml as a reference.");
            }
        }
Esempio n. 6
0
        // TODO split this up, possibly return a result instead of modifying xray
        /// <summary>
        /// Scans every paragraph of the book for the matchable terms of <paramref name="xray"/>, recording the
        /// occurrences on each term and adding or enhancing the corresponding excerpts. When building the new
        /// format, downloaded notable clips are also matched against each paragraph.
        /// </summary>
        /// <param name="xray">X-Ray being built; its Srl/Erl, term occurrences and excerpts are modified in place.</param>
        /// <param name="metadata">Book metadata, passed to the paragraph and term services.</param>
        /// <param name="rawMlStream">Raw markup stream. Read in full (as UTF-8) for chapter handling when building the old format; otherwise only its length is used.</param>
        /// <param name="useNewVersion">True to build the new format (no chapter handling, notable clips are matched).</param>
        /// <param name="skipNoLikes">Forwarded to the notable clip processing.</param>
        /// <param name="minClipLen">Forwarded to the notable clip processing.</param>
        /// <param name="overwriteChapters">Forwarded to the chapter handling (old format only).</param>
        /// <param name="editChaptersCallback">Forwarded to the chapter handling (old format only).</param>
        /// <param name="progress">Optional progress reporter, advanced once per paragraph that is scanned.</param>
        /// <param name="token">Cancellation token, checked once per paragraph.</param>
        /// <param name="ignoreSoftHypen">When true, terms are additionally searched in a copy of the paragraph text with soft hyphens removed.</param>
        /// <param name="shortEx">Not used by this method.</param>
        /// <exception cref="Exception">Thrown when no paragraphs could be located in the book.</exception>
        public void ExpandFromRawMl(
            XRay xray,
            IMetadata metadata,
            Stream rawMlStream,
            bool useNewVersion,
            bool skipNoLikes,
            int minClipLen,
            bool overwriteChapters,
            Func<bool> editChaptersCallback,
            IProgressBar progress,
            CancellationToken token,
            bool ignoreSoftHypen = false,
            bool shortEx         = true)
        {
            // Only load chapters when building the old format
            if (!useNewVersion)
            {
                rawMlStream.Seek(0, SeekOrigin.Begin);
                // TODO: passing stream, doc, and contents probably not necessary
                // NOTE(review): disposing this reader at the end of the block also closes rawMlStream,
                // which is owned by the caller -- confirm that no caller reuses the stream afterwards.
                using var streamReader = new StreamReader(rawMlStream, Encoding.UTF8);
                var readContents = streamReader.ReadToEnd();
                var utf8Doc      = new HtmlDocument();
                utf8Doc.LoadHtml(readContents);

                _chaptersService.HandleChapters(xray, xray.Asin, rawMlStream.Length, utf8Doc, readContents, overwriteChapters, editChaptersCallback);
            }
            else
            {
                // set default ERL to prevent filtering
                xray.Srl = 1;
                xray.Erl = rawMlStream.Length;
            }

            _logger.Log(CoreStrings.ScanningEbookContent);
            var timer = new System.Diagnostics.Stopwatch();

            timer.Start();

            var paragraphs = _paragraphsService.GetParagraphs(metadata).ToArray();

            if (paragraphs.Length == 0)
            {
                throw new Exception(CoreStrings.CouldNotLocateAnyParagraphs);
            }

            progress?.Set(0, paragraphs.Length);
            foreach (var paragraph in paragraphs)
            {
                token.ThrowIfCancellationRequested();

                //Skip paragraph if outside known chapter range or if html is missing (shouldn't be, just a safety check)
                if (paragraph.Location < xray.Srl || paragraph.Location > xray.Erl || paragraph.ContentHtml == null)
                {
                    continue;
                }

                // Copy of the paragraph text without soft hyphens (raw, mis-decoded and entity-encoded forms).
                // Stays empty, and is never searched, unless the option is enabled.
                var noSoftHypen = "";
                if (ignoreSoftHypen)
                {
                    noSoftHypen = paragraph.ContentText
                                  .Replace("\u00C2\u00AD", "")
                                  .Replace("&shy;", "")
                                  .Replace("&#xad;", "")
                                  .Replace("&#173;", "")
                                  .Replace("&#0173;", "");
                }

                foreach (var character in xray.Terms)
                {
                    //Search for character name and aliases in the html-less text. If failed, try in the HTML for rare situations.
                    //TODO: Improve location searching as IndexOf will not work if book length exceeds 2,147,483,647...
                    //If soft hyphen ignoring is turned on, also search hyphen-less text.
                    if (!character.Match)
                    {
                        continue;
                    }

                    bool termFound;
                    // Convert from UTF8 string to default-encoded representation
                    var search = character.Aliases.Select(alias => _encoding.GetString(Encoding.UTF8.GetBytes(alias)))
                                 .ToList();
                    if (character.RegexAliases)
                    {
                        // Aliases are regular expressions; the term name itself is not searched in this mode
                        termFound = search.Any(r => Regex.IsMatch(paragraph.ContentText, r)) ||
                                    search.Any(r => Regex.IsMatch(paragraph.ContentHtml!, r)) ||
                                    (ignoreSoftHypen && search.Any(r => Regex.IsMatch(noSoftHypen, r)));
                    }
                    else
                    {
                        // Search for character name and aliases
                        // If there is an apostrophe, attempt to match 's at the end of the term
                        // Match end of word, then search for any lingering punctuation
                        search.Add(character.TermName);
                        // Search list should be in descending order by length, even the term name itself
                        search = search.OrderByDescending(s => s.Length).ToList();

                        // TODO consider removing this "termfound" section 'cause it might be redundant and pointless now
                        var foundInParagraph = character.MatchCase
                            ? search.Any(paragraph.ContentText.Contains) || search.Any(paragraph.ContentHtml.Contains)
                            : search.Any(paragraph.ContentText.ContainsIgnorecase) || search.Any(paragraph.ContentHtml.ContainsIgnorecase);

                        // The soft-hyphen-free text must only be consulted when the option is on, for BOTH case
                        // modes (previously the case-insensitive half escaped the ignoreSoftHypen guard).
                        var foundWithoutSoftHyphens = ignoreSoftHypen &&
                                                      (character.MatchCase
                                                          ? search.Any(noSoftHypen.Contains)
                                                          : search.Any(noSoftHypen.ContainsIgnorecase));

                        termFound = foundInParagraph || foundWithoutSoftHyphens;
                    }

                    if (!termFound)
                    {
                        continue;
                    }

                    var occurrences = _termsService.FindOccurrences(metadata, character, paragraph);
                    if (!occurrences.Any())
                    {
                        // _logger.Log($"An error occurred while searching for start of highlight.\r\nWas looking for (or one of the aliases of): {character.TermName}\r\nSearching in: {node.InnerHtml}");
                        continue;
                    }

                    character.Occurrences.UnionWith(occurrences);

                    ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
                }

                // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                if (useNewVersion && xray.NotableClips != null)
                {
                    ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
                }

                progress?.Add(1);
            }

            timer.Stop();
            _logger.Log(string.Format(CoreStrings.ScanTime, timer.Elapsed));
            //output list of terms with no occurrences
            foreach (var t in xray.Terms.Where(t => t.Match && t.Occurrences.Count == 0))
            {
                _logger.Log(string.Format(CoreStrings.NoLocationsFoundForTerm, t.TermName));
            }
        }
Esempio n. 7
0
        /// <summary>
        /// Scans the content chunks of a KFX book for the matchable terms of <paramref name="xray"/>, recording
        /// occurrences on each term and creating or updating one excerpt per chunk that contains a match.
        /// Downloaded notable clips found in a chunk mark that chunk's excerpt as notable.
        /// </summary>
        /// <param name="xray">X-Ray being built; its Srl/Erl, term occurrences, excerpts and FoundNotables are modified in place.</param>
        /// <param name="kfx">Container providing the content chunks to scan.</param>
        /// <param name="skipNoLikes">When true, clips with zero likes do not create new excerpts.</param>
        /// <param name="minClipLen">Minimum clip text length required to create a new excerpt.</param>
        /// <param name="progress">Optional progress reporter, advanced once per content chunk.</param>
        /// <param name="token">Cancellation token, checked once per content chunk.</param>
        public void AddLocations(XRay xray,
                                 KfxContainer kfx,
                                 bool skipNoLikes,
                                 int minClipLen,
                                 IProgressBar progress,
                                 CancellationToken token)
        {
            _logger.Log("Scanning book content...");
            var contentChunks = kfx.GetContentChunks();

            // Set start and end of content
            // TODO Figure out how to identify the first *actual* bit of content after the TOC
            var last = contentChunks.Last();

            xray.Srl = 1;
            xray.Erl = last.Pid + last.Length - 1;

            var offset    = 0;
            var excerptId = 0;

            progress?.Set(0, contentChunks.Count);
            foreach (var contentChunk in contentChunks)
            {
                token.ThrowIfCancellationRequested();

                // Snapshot of the running offset so lambdas below never observe a later value
                var chunkStart = offset;

                if (contentChunk.ContentText != null)
                {
                    foreach (var character in xray.Terms.Where(term => term.Match))
                    {
                        // If the aliases are not supposed to be in regex format, escape them
                        var aliases = character.RegexAliases
                            ? character.Aliases
                            : character.Aliases.Select(Regex.Escape);

                        // The term name goes into the same pattern as the aliases, so in literal mode it must be
                        // escaped as well; otherwise names such as "C++" throw and "Mr. X" matches too loosely.
                        var termName = character.RegexAliases
                            ? character.TermName
                            : Regex.Escape(character.TermName);

                        var searchList = new[] { termName }.Concat(aliases).ToArray();

                        //Search content for character name and aliases, respecting the case setting
                        var regexOptions = character.MatchCase || character.RegexAliases
                            ? RegexOptions.None
                            : RegexOptions.IgnoreCase;

                        // Key: absolute start of the highlight, values: lengths of the matches starting there
                        var highlights = searchList
                                         .Select(search => Regex.Matches(contentChunk.ContentText, $@"{Quotes}?\b{search}{_punctuationMarks}", regexOptions))
                                         .SelectMany(matches => matches.Cast<Match>())
                                         .ToLookup(match => chunkStart + match.Index, match => match.Length);

                        if (highlights.Count == 0)
                        {
                            continue;
                        }

                        var highlightOccurrences = highlights.SelectMany(highlightGroup => highlightGroup.Select(highlight => new[] { highlightGroup.Key, highlight }));
                        character.Occurrences.AddRange(highlightOccurrences);

                        // Check excerpts: one excerpt per chunk, identified by the chunk's start offset
                        var existingExcerpt = xray.Excerpts.FirstOrDefault(t => t.Start == chunkStart);
                        if (existingExcerpt != null)
                        {
                            if (!existingExcerpt.RelatedEntities.Contains(character.Id))
                            {
                                existingExcerpt.RelatedEntities.Add(character.Id);
                            }
                        }
                        else
                        {
                            var newExcerpt = new Excerpt
                            {
                                Id     = excerptId++,
                                Start  = chunkStart,
                                Length = contentChunk.Length
                            };
                            newExcerpt.RelatedEntities.Add(character.Id);
                            xray.Excerpts.Add(newExcerpt);
                        }
                    }

                    // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                    if (xray.NotableClips != null)
                    {
                        foreach (var quote in xray.NotableClips)
                        {
                            if (contentChunk.ContentText.IndexOf(quote.Text, StringComparison.Ordinal) < 0)
                            {
                                continue;
                            }

                            // See if an excerpt already exists for this chunk. Excerpts start at the chunk's
                            // offset in the book, so that is the key to look up -- not the position of the
                            // quote inside the chunk, which would (almost) never match and create duplicates.
                            var excerpt = xray.Excerpts.FirstOrDefault(e => e.Start == chunkStart);
                            if (excerpt == null)
                            {
                                if (skipNoLikes && quote.Likes == 0 ||
                                    quote.Text.Length < minClipLen)
                                {
                                    continue;
                                }
                                excerpt = new Excerpt
                                {
                                    Id         = excerptId++,
                                    Start      = chunkStart,
                                    Length     = contentChunk.Length,
                                    Notable    = true,
                                    Highlights = quote.Likes
                                };
                                excerpt.RelatedEntities.Add(0); // Mark the excerpt as notable
                                // TODO: also add other related entities
                                xray.Excerpts.Add(excerpt);
                            }
                            else
                            {
                                // Promote the existing excerpt instead of creating a second one at the same location
                                if (!excerpt.Notable)
                                {
                                    excerpt.Notable    = true;
                                    excerpt.Highlights = quote.Likes;
                                }
                                if (!excerpt.RelatedEntities.Contains(0))
                                {
                                    excerpt.RelatedEntities.Add(0);
                                }
                            }

                            xray.FoundNotables++;
                        }
                    }
                }

                // The progress total is the number of chunks, so advance for text-less chunks too
                progress?.Add(1);

                offset += contentChunk.Length;
            }

            var missingOccurrences = xray.Terms
                                     .Where(term => term.Match && term.Occurrences.Count == 0)
                                     .Select(term => term.TermName)
                                     .ToArray();

            if (missingOccurrences.Length == 0)
            {
                return;
            }

            var termList = string.Join(", ", missingOccurrences);

            _logger.Log($"\r\nNo locations were found for the following terms. You should add aliases for them using the book as a reference:\r\n{termList}\r\n");
        }
Esempio n. 8
0
        // TODO: Review this...
        /// <summary>
        /// Builds the author profile data for the book in <paramref name="request"/>: searches Amazon for the
        /// author, resolves the biography (from Amazon and/or a local .bio file, optionally edited by the user),
        /// downloads the author image and, for the new format, gathers metadata about the author's other books.
        /// </summary>
        /// <param name="request">Book and settings to use. Note that <c>Settings.AmazonTld</c> is changed to "com" when a fallback search is made.</param>
        /// <param name="editBioCallback">Optional callback asking the user a yes/no question about editing the biography; null disables the prompts.</param>
        /// <param name="progress">Optional progress reporter used while gathering the other books' metadata.</param>
        /// <param name="cancellationToken">Cancellation token forwarded to all network calls.</param>
        /// <returns>
        /// The generated profile, or null when the author could not be found or the biography file could not be
        /// read or written.
        /// </returns>
        public async Task<Response> GenerateAsync(Request request, Func<string, bool> editBioCallback, IProgressBar progress = null, CancellationToken cancellationToken = default)
        {
            AuthorSearchResults searchResults = null;

            // Attempt to download from the alternate site, if present. If it fails in some way, try .com
            // If the .com search crashes, it will crash back to the caller in frmMain
            try
            {
                searchResults = await _amazonClient.SearchAuthor(request.Book.Author, request.Settings.AmazonTld, cancellationToken);
            }
            catch (Exception ex)
            {
                _logger.Log($"Error searching Amazon.{request.Settings.AmazonTld}: {ex.Message}\r\n{ex.StackTrace}");
            }

            if (searchResults == null)
            {
                _logger.Log($"Failed to find {request.Book.Author} on Amazon.{request.Settings.AmazonTld}");
                if (request.Settings.AmazonTld != "com")
                {
                    _logger.Log("Trying again with Amazon.com.");
                    request.Settings.AmazonTld = "com";
                    searchResults = await _amazonClient.SearchAuthor(request.Book.Author, request.Settings.AmazonTld, cancellationToken);
                }
            }

            if (searchResults == null)
            {
                return null; // Already logged error in search function
            }

            // Keep only books that neither share the requested book's title nor its asin.
            // Books may be null (see the check further down), so it must not be dereferenced blindly here.
            searchResults.Books = searchResults.Books
                                  ?.Where(book => book.Title.IndexOf(request.Book.Title, StringComparison.OrdinalIgnoreCase) < 0
                                                  && book.Asin != request.Book.Asin)
                                  .ToArray();

            var authorAsin = searchResults.Asin;

            // TODO: re-implement saving the author's Amazon page HTML (previously gated by a saveHtml setting) in a nicer way

            // TODO: Separate out biography stuff
            // Try to find author's biography
            string biography    = null;
            var    bioFile      = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ext", $"{authorAsin}.bio");
            var    readFromFile = false;
            var    newBioFile   = false;

            // Reads the biography file; returns null when reading failed (the error is logged)
            string ReadBio(string file)
            {
                try
                {
                    var fileText = Functions.ReadFromFile(file);
                    if (string.IsNullOrEmpty(fileText))
                    {
                        _logger.Log($"Found biography file, but it is empty!\r\n{file}");
                    }
                    else if (!string.Equals(biography, fileText))
                    {
                        _logger.Log($"Using biography from {file}.");
                    }

                    // todo fix this
                    if (fileText != null && fileText.Contains("No author biography found locally or on Amazon!"))
                    {
                        _logger.Log($"Warning: Local biography file contains an empty default biography.{Environment.NewLine}Delete {file} and try again");
                    }

                    return fileText;
                }
                catch (Exception ex)
                {
                    _logger.Log($"An error occurred while opening {file}\r\n{ex.Message}\r\n{ex.StackTrace}");
                }

                return null;
            }

            // Shortens the biography to at most 1000 characters and cleans problematic characters.
            // Returns null for a blank biography.
            string TrimBio(string bio)
            {
                const int maxBioLength = 1000;

                try
                {
                    if (string.IsNullOrWhiteSpace(bio))
                    {
                        return null;
                    }

                    if (bio.Length > maxBioLength)
                    {
                        // Look for a break point inside the allowed window only. Searching the whole text
                        // would usually find the biography's final full stop and trim nothing at all.
                        var head      = bio.Substring(0, maxBioLength);
                        var lastPunc  = head.LastIndexOfAny(new[] { '.', '!', '?' });
                        var lastSpace = head.LastIndexOf(' ');

                        if (lastPunc > lastSpace)
                        {
                            // Ends on a complete sentence
                            bio = head.Substring(0, lastPunc + 1);
                        }
                        else if (lastSpace > 0)
                        {
                            // Cut at the last whole word and signal the truncation
                            bio = $"{head.Substring(0, lastSpace)}{'\u2026'}";
                        }
                        else
                        {
                            // No usable break point at all: hard cut
                            bio = head;
                        }
                    }

                    return bio.Clean();
                }
                catch (Exception ex)
                {
                    _logger.Log($"An error occurred while trimming the biography\r\n{ex.Message}\r\n{ex.StackTrace}");
                }

                return bio;
            }

            if (searchResults.Biography == null && !File.Exists(bioFile))
            {
                if (request.Settings.AmazonTld != "com")
                {
                    _logger.Log(@"Searching for biography on Amazon.com…");
                    request.Settings.AmazonTld = "com";
                    var tempSearchResults = await _amazonClient.SearchAuthor(request.Book.Author, request.Settings.AmazonTld, cancellationToken, false);

                    if (tempSearchResults?.Biography != null)
                    {
                        searchResults.Biography = tempSearchResults.Biography;
                    }
                }
            }

            if (searchResults.Biography != null)
            {
                biography = searchResults.Biography;
            }

            if (File.Exists(bioFile) && request.Settings.SaveBio)
            {
                biography = ReadBio(bioFile);
                // if it's null, there was an error. if it's just empty, we'll parse it out instead
                if (biography == null)
                {
                    return null;
                }
                if (!string.IsNullOrEmpty(biography))
                {
                    readFromFile = true;
                }
            }

            if (!string.IsNullOrEmpty(biography) && (request.Settings.SaveBio || request.Settings.EditBiography))
            {
                if (!readFromFile)
                {
                    biography = TrimBio(biography);
                }

                if (!File.Exists(bioFile) && !string.IsNullOrEmpty(biography))
                {
                    File.WriteAllText(bioFile, biography);
                    newBioFile = true;
                    _logger.Log(@"Author biography found!");
                }
            }

            var message = biography == null
                ? $"No author biography found on Amazon!{Environment.NewLine}Would you like to create one?"
                : readFromFile
                    ? "Would you like to edit the existing biography?"
                    : $"Author biography found on Amazon!{Environment.NewLine}Would you like to edit it?";

            if (editBioCallback != null && editBioCallback(message))
            {
                if (!File.Exists(bioFile))
                {
                    File.WriteAllText(bioFile, string.Empty);
                }
                Functions.RunNotepad(bioFile);
                biography = ReadBio(bioFile);
            }

            if (string.IsNullOrEmpty(biography))
            {
                biography = "No author biography found locally or on Amazon!";
                _logger.Log("An error occurred finding the author biography.");
            }

            if (request.Settings.SaveBio)
            {
                if (!File.Exists(bioFile))
                {
                    try
                    {
                        _logger.Log($"Saving biography to {bioFile}");
                        using var streamWriter = new StreamWriter(bioFile, false, System.Text.Encoding.UTF8);
                        await streamWriter.WriteAsync(biography);
                    }
                    catch (Exception ex)
                    {
                        _logger.Log($"An error occurred while writing biography.\r\n{ex.Message}\r\n{ex.StackTrace}");
                        return null;
                    }
                }

                if (newBioFile)
                {
                    _logger.Log(@"New biography file opened in notepad for editing…");
                    Functions.RunNotepad(bioFile);
                    biography = ReadBio(bioFile);
                    if (string.IsNullOrEmpty(biography))
                    {
                        return null;
                    }
                    searchResults.Biography = biography;
                }

                if (!readFromFile && editBioCallback != null && editBioCallback("Would you like to open the biography file in notepad for editing?"))
                {
                    Functions.RunNotepad(bioFile);
                    biography = ReadBio(bioFile);
                    if (string.IsNullOrEmpty(biography))
                    {
                        return null;
                    }
                }
            }

            searchResults.Biography = biography;

            // Try to download Author image
            request.Book.AuthorImageUrl = searchResults.ImageUrl;

            Bitmap authorImage = null;

            try
            {
                _logger.Log("Downloading author image…");
                authorImage = await _httpClient.GetImageAsync(request.Book.AuthorImageUrl, cancellationToken : cancellationToken);

                _logger.Log("Grayscale base64-encoded author image created!");
            }
            catch (Exception ex)
            {
                // A missing image is not fatal; the profile is returned without one
                _logger.Log($"An error occurred downloading the author image: {ex.Message}");
            }

            var bookBag = new ConcurrentBag<BookInfo>();

            if (searchResults.Books != null && request.Settings.UseNewVersion)
            {
                if (searchResults.Books.Length != 0)
                {
                    // todo pluralize
                    _logger.Log(searchResults.Books.Length > 1
                        ? $"Gathering metadata for {searchResults.Books.Length} other books by {request.Book.Author}…"
                        : $"Gathering metadata for another book by {request.Book.Author}…");
                }
                try
                {
                    progress?.Set(0, searchResults.Books.Length);
                    await _amazonClient
                    .EnhanceBookInfos(searchResults.Books, cancellationToken)
                    .ForEachAsync(book =>
                    {
                        bookBag.Add(book);
                        progress?.Add(1);
                    }, cancellationToken);

                    progress?.Set(0, 0);
                    _logger.Log("Metadata gathering complete!");
                }
                catch (Exception ex)
                {
                    _logger.Log($"An error occurred gathering metadata for other books: {ex.Message}");
                    throw;
                }
            }
            else
            {
                _logger.Log($"Unable to find other books by {request.Book.Author}. If there should be some, check the Amazon URL to ensure it is correct.");
            }

            _logger.Log("Writing Author Profile to file…");

            return new Response
            {
                Asin = authorAsin,
                Name = request.Book.Author,
                OtherBooks = bookBag.ToArray(),
                Biography = biography,
                Image = authorImage,
                ImageUrl = searchResults.ImageUrl
            };
        }
Esempio n. 9
0
        /// <summary>
        /// Populates a <paramref name="db"/> with the data from <paramref name="xray"/>
        /// </summary>
        /// <remarks>
        /// Writes, in order: the data URL, all entities with their descriptions and occurrences, all excerpts
        /// with their entity links, the notable clip links (topped up with other excerpts when few notables
        /// exist), the top mentioned entities per type and finally the book metadata row.
        /// All values are bound as parameters; commands are prepared once and reused for every row.
        /// </remarks>
        /// <param name="xray">Source of the terms, excerpts and book range to write.</param>
        /// <param name="db">Open connection to the database that receives the rows.</param>
        /// <param name="progress">Optional progress reporter, advanced once per term and once per excerpt.</param>
        /// <param name="token">Cancellation token, checked between rows and between the major steps.</param>
        private void Populate(XRay xray, SQLiteConnection db, IProgressBar progress, CancellationToken token = default)
        {
            var personCount = 0;
            var termCount   = 0;

            using (var updateDataUrl = new SQLiteCommand("update string set text=@text where id=15", db))
            {
                // Interpolation keeps the previous text conversion (null becomes an empty string)
                updateDataUrl.Parameters.Add("@text", DbType.String).Value = $"{xray.DataUrl}";
                updateDataUrl.ExecuteNonQuery();
            }

            _logger.Log("Updating database with terms, descriptions, and excerpts...");
            //Write all entities and occurrences
            _logger.Log($"Writing {xray.Terms.Count} terms...");
            progress?.Set(0, xray.Terms.Count);

            using (var insertEntity = new SQLiteCommand("insert into entity (id, label, loc_label, type, count, has_info_card) values (@id, @label, null, @type, @count, 1)", db))
            using (var insertDescription = new SQLiteCommand("insert into entity_description (text, source_wildcard, source, entity) values (@text, @source_wildcard, @source, @entity)", db))
            using (var insertOccurrence = new SQLiteCommand("insert into occurrence (entity, start, length) values (@entity, @start, @length)", db))
            {
                // Parameters are created once; only their values change per row
                var entityId    = insertEntity.Parameters.Add("@id", DbType.Int32);
                var entityLabel = insertEntity.Parameters.Add("@label", DbType.String);
                var entityType  = insertEntity.Parameters.Add("@type", DbType.Int32);
                var entityCount = insertEntity.Parameters.Add("@count", DbType.Int32);

                var descriptionText     = insertDescription.Parameters.Add("@text", DbType.String);
                var descriptionWildcard = insertDescription.Parameters.Add("@source_wildcard", DbType.String);
                var descriptionSource   = insertDescription.Parameters.Add("@source", DbType.Int32);
                var descriptionEntity   = insertDescription.Parameters.Add("@entity", DbType.Int32);

                var occurrenceEntity = insertOccurrence.Parameters.Add("@entity", DbType.Int32);
                var occurrenceStart  = insertOccurrence.Parameters.Add("@start", DbType.Int32);
                var occurrenceLength = insertOccurrence.Parameters.Add("@length", DbType.Int32);

                foreach (var t in xray.Terms)
                {
                    token.ThrowIfCancellationRequested();
                    if (t.Type == "character")
                    {
                        personCount++;
                    }
                    else if (t.Type == "topic")
                    {
                        termCount++;
                    }

                    entityId.Value    = t.Id;
                    entityLabel.Value = t.TermName;
                    entityType.Value  = t.Type == "character" ? 1 : 2;
                    entityCount.Value = t.Occurrences.Count;
                    insertEntity.ExecuteNonQuery();

                    descriptionText.Value     = string.IsNullOrEmpty(t.Desc) ? "No description available." : t.Desc;
                    descriptionWildcard.Value = t.TermName;
                    descriptionSource.Value   = t.DescSrc == "shelfari" ? 4 : 6;
                    descriptionEntity.Value   = t.Id;
                    insertDescription.ExecuteNonQuery();

                    occurrenceEntity.Value = t.Id;
                    foreach (var occurrence in t.Occurrences)
                    {
                        // Highlight index is relative to its excerpt; the table wants an absolute position
                        occurrenceStart.Value  = occurrence.Excerpt.Index + occurrence.Highlight.Index;
                        occurrenceLength.Value = occurrence.Highlight.Length;
                        insertOccurrence.ExecuteNonQuery();
                    }
                    progress?.Add(1);
                }
            }

            //Write excerpts and entity_excerpt table
            _logger.Log($"Writing {xray.Excerpts.Count} excerpts...");
            progress?.Set(0, xray.Excerpts.Count);

            using (var insertExcerpt = new SQLiteCommand("insert into excerpt (id, start, length, image, related_entities, goto) values (@id, @start, @length, @image, @rel_ent, null);", db))
            using (var insertEntityExcerpt = new SQLiteCommand("insert into entity_excerpt (entity, excerpt) values (@entityId, @excerptId)", db))
            {
                var excerptId      = insertExcerpt.Parameters.Add("@id", DbType.Int32);
                var excerptStart   = insertExcerpt.Parameters.Add("@start", DbType.Int32);
                var excerptLength  = insertExcerpt.Parameters.Add("@length", DbType.Int32);
                var excerptImage   = insertExcerpt.Parameters.Add("@image", DbType.String);
                var excerptRelated = insertExcerpt.Parameters.Add("@rel_ent", DbType.String);

                var linkEntity  = insertEntityExcerpt.Parameters.Add("@entityId", DbType.Int32);
                var linkExcerpt = insertEntityExcerpt.Parameters.Add("@excerptId", DbType.Int32);

                foreach (var e in xray.Excerpts)
                {
                    token.ThrowIfCancellationRequested();

                    // don't write 0 (notable flag)
                    var relatedEntities = e.RelatedEntities.Where(en => en != 0).ToArray();

                    excerptId.Value      = e.Id;
                    excerptStart.Value   = e.Start;
                    excerptLength.Value  = e.Length;
                    excerptImage.Value   = e.Image;
                    excerptRelated.Value = string.Join(",", relatedEntities);
                    insertExcerpt.ExecuteNonQuery();

                    linkExcerpt.Value = e.Id;
                    foreach (var ent in relatedEntities)
                    {
                        token.ThrowIfCancellationRequested();
                        linkEntity.Value = ent;
                        insertEntityExcerpt.ExecuteNonQuery();
                    }
                    progress?.Add(1);
                }
            }

            // create links to notable clips in order of popularity
            _logger.Log("Adding notable clips...");
            var notables = xray.Excerpts.Where(ex => ex.Notable).OrderByDescending(ex => ex.Highlights).ToList();

            // Populate some more clips if not enough were found initially
            // TODO: Add a config value in settings for this amount
            const int minimumNotables = 20;
            var       fillers         = new List<Excerpt>(minimumNotables);
            var       foundNotables   = notables.Count;

            if (foundNotables <= minimumNotables)
            {
                // Only excerpts that are not notable yet qualify; notable ones are linked already and
                // adding them again would write duplicate rows.
                var eligible = xray.Excerpts.Where(ex => !ex.Notable).ToList();

                if (foundNotables + xray.Excerpts.Count <= minimumNotables)
                {
                    // Too few excerpts overall: take every remaining one
                    fillers.AddRange(eligible);
                }
                else
                {
                    // Top up with randomly chosen excerpts
                    var rand = new Random();
                    while (foundNotables <= minimumNotables && eligible.Count > 0)
                    {
                        var pick = rand.Next(eligible.Count);
                        fillers.Add(eligible[pick]);
                        eligible.RemoveAt(pick);
                        foundNotables++;
                    }
                }
            }

            using (var insertNotable = new SQLiteCommand("insert into entity_excerpt (entity, excerpt) values (0, @excerptId)", db))
            {
                var notableExcerptId = insertNotable.Parameters.Add("@excerptId", DbType.Int32);
                foreach (var excerpt in notables.Concat(fillers))
                {
                    notableExcerptId.Value = excerpt.Id;
                    insertNotable.ExecuteNonQuery();
                }
            }

            token.ThrowIfCancellationRequested();
            _logger.Log("Writing top mentions...");

            // Comma separated ids of the (up to) ten most mentioned terms of the given type
            string TopMentioned(string type) => string.Join(",",
                xray.Terms
                    .Where(t => t.Type == type)
                    .OrderByDescending(t => t.Occurrences.Count)
                    .Select(t => t.Id)
                    .Take(10));

            using (var updateTopMentions = new SQLiteCommand("update type set top_mentioned_entities=@ids where id=@typeId", db))
            {
                var topIds = updateTopMentions.Parameters.Add("@ids", DbType.String);
                var typeId = updateTopMentions.Parameters.Add("@typeId", DbType.Int32);

                topIds.Value = TopMentioned("character");
                typeId.Value = 1;
                updateTopMentions.ExecuteNonQuery();

                topIds.Value = TopMentioned("topic");
                typeId.Value = 2;
                updateTopMentions.ExecuteNonQuery();
            }

            token.ThrowIfCancellationRequested();
            _logger.Log("Writing metadata...");

            using (var insertMetadata = new SQLiteCommand(
                "insert into book_metadata (srl, erl, has_images, has_excerpts, show_spoilers_default, num_people, num_terms, num_images, preview_images) "
                + "values (@srl, @erl, 0, 1, 0, @numPeople, @numTerms, 0, null);", db))
            {
                insertMetadata.Parameters.Add("@srl", DbType.Int64).Value      = xray.Srl;
                insertMetadata.Parameters.Add("@erl", DbType.Int64).Value      = xray.Erl;
                insertMetadata.Parameters.Add("@numPeople", DbType.Int32).Value = personCount;
                insertMetadata.Parameters.Add("@numTerms", DbType.Int32).Value  = termCount;
                insertMetadata.ExecuteNonQuery();
            }
        }