예제 #1
0
        /// <summary>
        /// Scans every paragraph of the KFX book for the matchable terms of <paramref name="xray"/>,
        /// recording the occurrences and excerpts that are found, and sets the X-Ray start and end
        /// locations (<c>Srl</c>/<c>Erl</c>).
        /// </summary>
        /// <param name="xray">X-Ray whose terms, excerpts, and Srl/Erl values are updated in place.</param>
        /// <param name="kfx">Book container the paragraphs are read from.</param>
        /// <param name="skipNoLikes">Passed through to the notable clip processing.</param>
        /// <param name="minClipLen">Passed through to the notable clip processing.</param>
        /// <param name="progress">Optional progress reporter; may be null.</param>
        /// <param name="token">Checked once per paragraph so the scan can be cancelled.</param>
        /// <exception cref="System.InvalidOperationException">No paragraphs could be read from the book.</exception>
        /// <exception cref="System.OperationCanceledException">Cancellation was requested via <paramref name="token"/>.</exception>
        public void AddLocations(XRay xray,
                                 KfxContainer kfx,
                                 bool skipNoLikes,
                                 int minClipLen,
                                 IProgressBar progress,
                                 CancellationToken token)
        {
            _logger.Log("Scanning book content...");

            var paragraphs = _paragraphsService.GetParagraphs(kfx).ToArray();

            // Fail with a descriptive message instead of the opaque "Sequence contains no elements"
            // that Last() would raise on an empty array (same exception type as before).
            if (paragraphs.Length == 0)
            {
                throw new System.InvalidOperationException("Could not locate any paragraphs in the book content.");
            }

            // Set start and end of content
            // TODO Figure out how to identify the first *actual* bit of content after the TOC
            var last = paragraphs[paragraphs.Length - 1];

            xray.Srl = 1;
            xray.Erl = last.Location + last.Length - 1;

            progress?.Set(0, paragraphs.Length);
            foreach (var paragraph in paragraphs)
            {
                token.ThrowIfCancellationRequested();

                foreach (var character in xray.Terms.Where(term => term.Match))
                {
                    var occurrences = _termsService.FindOccurrences(kfx, character, paragraph);
                    if (!occurrences.Any())
                    {
                        continue;
                    }

                    character.Occurrences.UnionWith(occurrences);

                    // Record the whole paragraph as an excerpt related to this term
                    ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
                }

                // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                if (xray.NotableClips != null)
                {
                    ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
                }

                progress?.Add(1);
            }

            // Report the matchable terms that were never located so the user can add aliases for them
            var missingOccurrences = xray.Terms
                                     .Where(term => term.Match && term.Occurrences.Count == 0)
                                     .Select(term => term.TermName)
                                     .ToArray();

            if (missingOccurrences.Length == 0)
            {
                return;
            }

            var termList = string.Join(", ", missingOccurrences);

            _logger.Log($"\r\nNo locations were found for the following terms. You should add aliases for them using the book as a reference:\r\n{termList}\r\n");
        }
예제 #2
0
        // TODO split this up, possible return a result instead of modifying xray
        /// <summary>
        /// Scans the book's paragraphs for the matchable terms of <paramref name="xray"/>, recording
        /// occurrences and excerpts on the X-Ray. When building the old format, chapters are handled
        /// first from the raw markup; otherwise Srl/Erl are set to span the whole stream.
        /// </summary>
        /// <param name="xray">X-Ray that is modified in place (terms, excerpts, Srl/Erl).</param>
        /// <param name="metadata">Book metadata the paragraphs and term occurrences are read from.</param>
        /// <param name="rawMlStream">Raw markup stream; only read when <paramref name="useNewVersion"/> is false.</param>
        /// <param name="useNewVersion">True to skip chapter handling and to process notable clips.</param>
        /// <param name="skipNoLikes">Passed through to the notable clip processing.</param>
        /// <param name="minClipLen">Passed through to the notable clip processing.</param>
        /// <param name="overwriteChapters">Passed through to the chapter handling.</param>
        /// <param name="editChaptersCallback">Passed through to the chapter handling.</param>
        /// <param name="progress">Optional progress reporter; may be null.</param>
        /// <param name="token">Checked once per paragraph so the scan can be cancelled.</param>
        /// <param name="ignoreSoftHypen">When true, terms are also searched in a copy of the paragraph text with soft hyphens removed.</param>
        /// <param name="shortEx">Not used by this method body.</param>
        /// <exception cref="Exception">No paragraphs could be located.</exception>
        /// <exception cref="OperationCanceledException">Cancellation was requested via <paramref name="token"/>.</exception>
        public void ExpandFromRawMl(
            XRay xray,
            IMetadata metadata,
            Stream rawMlStream,
            bool useNewVersion,
            bool skipNoLikes,
            int minClipLen,
            bool overwriteChapters,
            Func<bool> editChaptersCallback,
            IProgressBar progress,
            CancellationToken token,
            bool ignoreSoftHypen = false,
            bool shortEx         = true)
        {
            // Only load chapters when building the old format
            if (!useNewVersion)
            {
                rawMlStream.Seek(0, SeekOrigin.Begin);
                // TODO: passing stream, doc, and contents probably not necessary)
                // NOTE: the reader is disposed at the end of this block, which also closes rawMlStream.
                using var streamReader = new StreamReader(rawMlStream, Encoding.UTF8);
                var readContents = streamReader.ReadToEnd();
                var utf8Doc      = new HtmlDocument();
                utf8Doc.LoadHtml(readContents);

                _chaptersService.HandleChapters(xray, xray.Asin, rawMlStream.Length, utf8Doc, readContents, overwriteChapters, editChaptersCallback);
            }
            else
            {
                // set default ERL to prevent filtering
                xray.Srl = 1;
                xray.Erl = rawMlStream.Length;
            }

            _logger.Log(CoreStrings.ScanningEbookContent);
            var timer = System.Diagnostics.Stopwatch.StartNew();

            var paragraphs = _paragraphsService.GetParagraphs(metadata).ToArray();

            if (paragraphs.Length == 0)
            {
                throw new Exception(CoreStrings.CouldNotLocateAnyParagraphs);
            }

            progress?.Set(0, paragraphs.Length);
            foreach (var paragraph in paragraphs)
            {
                token.ThrowIfCancellationRequested();

                //Skip paragraph if outside known chapter range or if html is missing (shouldn't be, just a safety check)
                if (paragraph.Location < xray.Srl || paragraph.Location > xray.Erl || paragraph.ContentHtml == null)
                {
                    continue;
                }

                // Copy of the text with soft hyphens removed; stays empty (and unused) unless requested.
                // "\u00C2\u00AD" is presumably the UTF-8 byte pair of a soft hyphen as it appears after
                // being decoded with a single-byte encoding - TODO confirm.
                var noSoftHypen = ignoreSoftHypen
                    ? paragraph.ContentText
                               .Replace("\u00C2\u00AD", "")
                               .Replace("&shy;", "")
                               .Replace("&#xad;", "")
                               .Replace("&#173;", "")
                               .Replace("&#0173;", "")
                    : "";

                foreach (var character in xray.Terms)
                {
                    //Search for character name and aliases in the html-less text. If failed, try in the HTML for rare situations.
                    //TODO: Improve location searching as IndexOf will not work if book length exceeds 2,147,483,647...
                    //If soft hyphen ignoring is turned on, also search hyphen-less text.
                    if (!character.Match)
                    {
                        continue;
                    }

                    // Convert from UTF8 string to default-encoded representation
                    var search = character.Aliases
                                          .Select(alias => _encoding.GetString(Encoding.UTF8.GetBytes(alias)))
                                          .ToList();
                    if (!character.RegexAliases)
                    {
                        // Search for character name and aliases
                        // If there is an apostrophe, attempt to match 's at the end of the term
                        // Match end of word, then search for any lingering punctuation
                        search.Add(character.TermName);
                        // Search list should be in descending order by length, even the term name itself
                        search = search.OrderByDescending(s => s.Length).ToList();
                    }

                    // True when any search entry is found in the given text, honouring the term's
                    // regex / case-sensitivity settings.
                    bool FoundIn(string text)
                    {
                        if (character.RegexAliases)
                        {
                            return search.Any(pattern => Regex.IsMatch(text, pattern));
                        }

                        return character.MatchCase
                            ? search.Any(text.Contains)
                            : search.Any(text.ContainsIgnorecase);
                    }

                    // TODO consider removing this "termfound" section 'cause it might be redundant and pointless now
                    // The soft-hyphen-free text is only consulted when ignoreSoftHypen is set; the guard
                    // covers both the case-sensitive and the case-insensitive search.
                    var termFound = FoundIn(paragraph.ContentText)
                                    || FoundIn(paragraph.ContentHtml!)
                                    || (ignoreSoftHypen && FoundIn(noSoftHypen));

                    if (!termFound)
                    {
                        continue;
                    }

                    var occurrences = _termsService.FindOccurrences(metadata, character, paragraph);
                    if (!occurrences.Any())
                    {
                        // The quick pre-check matched but no concrete occurrence was located; skip quietly
                        continue;
                    }

                    character.Occurrences.UnionWith(occurrences);

                    // Record the whole paragraph as an excerpt related to this term
                    ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
                }

                // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
                if (useNewVersion && xray.NotableClips != null)
                {
                    ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
                }

                progress?.Add(1);
            }

            timer.Stop();
            _logger.Log(string.Format(CoreStrings.ScanTime, timer.Elapsed));
            //output list of terms with no occurrences
            foreach (var t in xray.Terms.Where(t => t.Match && t.Occurrences.Count == 0))
            {
                _logger.Log(string.Format(CoreStrings.NoLocationsFoundForTerm, t.TermName));
            }
        }