/// <summary>
/// Scans every paragraph of a KFX book and records, on <paramref name="xray"/>, the occurrences and
/// excerpts for each matchable term. When <c>xray.NotableClips</c> is set, each paragraph is also
/// run through the notable-clip matcher.
/// </summary>
/// <param name="xray">X-Ray being populated; its Srl/Erl, term occurrences and excerpts are modified in place.</param>
/// <param name="kfx">Book container the paragraphs and term occurrences are read from.</param>
/// <param name="skipNoLikes">Forwarded to <c>ExcerptHelper.ProcessNotablesForParagraph</c>.</param>
/// <param name="minClipLen">Forwarded to <c>ExcerptHelper.ProcessNotablesForParagraph</c>.</param>
/// <param name="progress">Optional progress reporter; may be null.</param>
/// <param name="token">Checked once per paragraph; cancelling aborts the scan.</param>
/// <exception cref="InvalidOperationException">No paragraphs could be read from <paramref name="kfx"/>.</exception>
/// <exception cref="OperationCanceledException"><paramref name="token"/> was cancelled.</exception>
public void AddLocations(XRay xray, KfxContainer kfx, bool skipNoLikes, int minClipLen, IProgressBar progress, CancellationToken token)
{
    _logger.Log("Scanning book content...");

    var paragraphs = _paragraphsService.GetParagraphs(kfx).ToArray();
    if (paragraphs.Length == 0)
    {
        // Previously surfaced as an opaque "Sequence contains no elements" from Last().
        // Same exception type, but with the message the raw-ML scan already uses.
        throw new InvalidOperationException(CoreStrings.CouldNotLocateAnyParagraphs);
    }

    // Set start and end of content
    // TODO Figure out how to identify the first *actual* bit of content after the TOC
    var last = paragraphs[paragraphs.Length - 1];
    xray.Srl = 1;
    xray.Erl = last.Location + last.Length - 1;

    // The set of matchable terms does not change while scanning, so filter it once.
    var matchableTerms = xray.Terms.Where(term => term.Match).ToArray();

    progress?.Set(0, paragraphs.Length);
    foreach (var paragraph in paragraphs)
    {
        token.ThrowIfCancellationRequested();

        foreach (var character in matchableTerms)
        {
            var occurrences = _termsService.FindOccurrences(kfx, character, paragraph);
            if (!occurrences.Any())
            {
                continue;
            }

            character.Occurrences.UnionWith(occurrences);
            ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
        }

        // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
        if (xray.NotableClips != null)
        {
            ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
        }

        progress?.Add(1);
    }

    // Tell the user which matchable terms never appeared so they can add aliases.
    var missingOccurrences = xray.Terms
        .Where(term => term.Match && term.Occurrences.Count == 0)
        .Select(term => term.TermName)
        .ToArray();
    if (missingOccurrences.Length == 0)
    {
        return;
    }

    var termList = string.Join(", ", missingOccurrences);
    _logger.Log($"\r\nNo locations were found for the following terms. You should add aliases for them using the book as a reference:\r\n{termList}\r\n");
}
// TODO split this up, possibly return a result instead of modifying xray
/// <summary>
/// Scans the book's paragraphs and records, on <paramref name="xray"/>, the occurrences and excerpts
/// for each matchable term. For the old format (<paramref name="useNewVersion"/> is false) the raw
/// markup is first read from <paramref name="rawMlStream"/> and handed to the chapters service; for
/// the new format Srl/Erl are set to span the whole stream and notable clips are matched per paragraph.
/// </summary>
/// <param name="xray">X-Ray being populated; modified in place.</param>
/// <param name="metadata">Book metadata the paragraphs and term occurrences are read from.</param>
/// <param name="rawMlStream">Raw markup stream; must be seekable when <paramref name="useNewVersion"/> is false.</param>
/// <param name="useNewVersion">Selects new-format behaviour (no chapter handling, notable clips processed).</param>
/// <param name="skipNoLikes">Forwarded to <c>ExcerptHelper.ProcessNotablesForParagraph</c>.</param>
/// <param name="minClipLen">Forwarded to <c>ExcerptHelper.ProcessNotablesForParagraph</c>.</param>
/// <param name="overwriteChapters">Forwarded to the chapters service (old format only).</param>
/// <param name="editChaptersCallback">Forwarded to the chapters service (old format only).</param>
/// <param name="progress">Optional progress reporter; may be null.</param>
/// <param name="token">Checked once per paragraph; cancelling aborts the scan.</param>
/// <param name="ignoreSoftHypen">When true, terms are also searched for in a copy of the paragraph text with soft hyphens removed.</param>
/// <param name="shortEx">Not referenced by this method.</param>
/// <exception cref="Exception">No paragraphs could be read from <paramref name="metadata"/>.</exception>
/// <exception cref="OperationCanceledException"><paramref name="token"/> was cancelled.</exception>
public void ExpandFromRawMl(
    XRay xray,
    IMetadata metadata,
    Stream rawMlStream,
    bool useNewVersion,
    bool skipNoLikes,
    int minClipLen,
    bool overwriteChapters,
    Func<bool> editChaptersCallback,
    IProgressBar progress,
    CancellationToken token,
    bool ignoreSoftHypen = false,
    bool shortEx = true)
{
    // Only load chapters when building the old format
    if (!useNewVersion)
    {
        rawMlStream.Seek(0, SeekOrigin.Begin);
        // TODO: passing stream, doc, and contents probably not necessary
        // NOTE(review): this reader is disposed at the end of the block, and a StreamReader created
        // without leaveOpen also closes rawMlStream. Kept as-is because callers may rely on it;
        // confirm stream ownership before changing.
        using var streamReader = new StreamReader(rawMlStream, Encoding.UTF8);
        var readContents = streamReader.ReadToEnd();
        var utf8Doc = new HtmlDocument();
        utf8Doc.LoadHtml(readContents);
        _chaptersService.HandleChapters(xray, xray.Asin, rawMlStream.Length, utf8Doc, readContents, overwriteChapters, editChaptersCallback);
    }
    else
    {
        // set default ERL to prevent filtering
        xray.Srl = 1;
        xray.Erl = rawMlStream.Length;
    }

    _logger.Log(CoreStrings.ScanningEbookContent);
    var timer = System.Diagnostics.Stopwatch.StartNew();

    var paragraphs = _paragraphsService.GetParagraphs(metadata).ToArray();
    if (paragraphs.Length == 0)
    {
        throw new Exception(CoreStrings.CouldNotLocateAnyParagraphs);
    }

    // The search strings of a term do not depend on the paragraph, so build them once per term
    // instead of once per (paragraph, term) pair.
    var candidates = xray.Terms
        .Where(term => term.Match)
        .Select(term =>
        {
            // Convert from UTF8 string to default-encoded representation
            var converted = term.Aliases.Select(alias => _encoding.GetString(Encoding.UTF8.GetBytes(alias)));
            var useRegex = term.RegexAliases;
            // Plain-text matching also searches for the term name itself.
            // Search list should be in descending order by length, even the term name itself
            var searchTerms = useRegex
                ? converted.ToArray()
                : converted.Concat(new[] { term.TermName }).OrderByDescending(s => s.Length).ToArray();
            return new { Term = term, SearchTerms = searchTerms, UseRegex = useRegex, MatchCase = term.MatchCase };
        })
        .ToArray();

    progress?.Set(0, paragraphs.Length);
    foreach (var paragraph in paragraphs)
    {
        token.ThrowIfCancellationRequested();

        //Skip paragraph if outside known chapter range or if html is missing (shouldn't be, just a safety check)
        var contentHtml = paragraph.ContentHtml;
        if (paragraph.Location < xray.Srl || paragraph.Location > xray.Erl || contentHtml == null)
        {
            // Skipped paragraphs still count towards the total passed to Set(), so advance here too.
            progress?.Add(1);
            continue;
        }

        var contentText = paragraph.ContentText;
        var noSoftHypen = ignoreSoftHypen ? StripSoftHyphens(contentText) : "";

        foreach (var candidate in candidates)
        {
            //Search for character name and aliases in the html-less text. If failed, try in the HTML for rare situations.
            //TODO: Improve location searching as IndexOf will not work if book length exceeds 2,147,483,647...
            //If soft hyphen ignoring is turned on, also search hyphen-less text.
            // TODO consider removing this pre-check 'cause it might be redundant and pointless now
            if (!ParagraphMentionsTerm(contentText, contentHtml, noSoftHypen, ignoreSoftHypen, candidate.SearchTerms, candidate.UseRegex, candidate.MatchCase))
            {
                continue;
            }

            var character = candidate.Term;
            var occurrences = _termsService.FindOccurrences(metadata, character, paragraph);
            if (!occurrences.Any())
            {
                continue;
            }

            character.Occurrences.UnionWith(occurrences);
            ExcerptHelper.EnhanceOrAddExcerpts(xray.Excerpts, character.Id, new IndexLength(paragraph.Location, paragraph.Length));
        }

        // Attempt to match downloaded notable clips, not worried if no matches occur as some will be added later anyway
        if (useNewVersion && xray.NotableClips != null)
        {
            ExcerptHelper.ProcessNotablesForParagraph(paragraph.ContentText, paragraph.Location, xray.NotableClips, xray.Excerpts, skipNoLikes, minClipLen);
        }

        progress?.Add(1);
    }

    timer.Stop();
    _logger.Log(string.Format(CoreStrings.ScanTime, timer.Elapsed));

    //output list of terms with no occurrences
    foreach (var unmatched in xray.Terms.Where(term => term.Match && term.Occurrences.Count == 0))
    {
        _logger.Log(string.Format(CoreStrings.NoLocationsFoundForTerm, unmatched.TermName));
    }
}

/// <summary>
/// Returns <paramref name="text"/> with soft hyphens removed: first the two-character sequence
/// U+00C2 U+00AD, then any remaining lone U+00AD.
/// </summary>
private static string StripSoftHyphens(string text)
{
    return text
        .Replace("\u00C2\u00AD", "")
        .Replace("\u00AD", "");
}

/// <summary>
/// Cheap pre-check: does any of <paramref name="searchTerms"/> appear in the paragraph's text, its
/// HTML, or (only when <paramref name="ignoreSoftHypen"/> is set) its soft-hyphen-free text?
/// Search terms are treated as regex patterns when <paramref name="regexAliases"/> is true,
/// otherwise as literal substrings compared with or without case per <paramref name="matchCase"/>.
/// </summary>
private static bool ParagraphMentionsTerm(
    string contentText,
    string contentHtml,
    string noSoftHypen,
    bool ignoreSoftHypen,
    string[] searchTerms,
    bool regexAliases,
    bool matchCase)
{
    if (regexAliases)
    {
        return searchTerms.Any(pattern => Regex.IsMatch(contentText, pattern))
            || searchTerms.Any(pattern => Regex.IsMatch(contentHtml, pattern))
            || (ignoreSoftHypen && searchTerms.Any(pattern => Regex.IsMatch(noSoftHypen, pattern)));
    }

    if (matchCase)
    {
        return searchTerms.Any(contentText.Contains)
            || searchTerms.Any(contentHtml.Contains)
            || (ignoreSoftHypen && searchTerms.Any(noSoftHypen.Contains));
    }

    // The soft-hyphen-free text is consulted only when the option is on. The original condition
    // lacked parentheses, so the case-insensitive branch ran regardless of ignoreSoftHypen.
    return searchTerms.Any(contentText.ContainsIgnorecase)
        || searchTerms.Any(contentHtml.ContainsIgnorecase)
        || (ignoreSoftHypen && searchTerms.Any(noSoftHypen.ContainsIgnorecase));
}