/// <summary>
/// Runs the configured pattern over the text assembled from the parsed characters and
/// returns one location per rectangle covered by each match.
/// </summary>
/// <remarks>
/// The parsed characters are first sorted into logical order, then flattened into a string by
/// <c>CharacterRenderInfo.MapString</c>. Every match is translated back from string offsets to
/// a range of parsed characters through the index map produced by that call.
/// <para />
/// Only string offsets at which a chunk starts are present in the index map. A match may start
/// on an inserted space or end at the very end of the text (or right before an inserted space),
/// so the offsets are resolved to the nearest mapped chunk instead of being looked up directly.
/// </remarks>
/// <returns>the sorted collection of locations of all matches; empty if nothing matched</returns>
public virtual ICollection<IPdfTextLocation> GetResultantLocations() {
    // align characters in "logical" order
    JavaCollectionsUtil.Sort(parseResult, new TextChunkLocationBasedComparator(new DefaultTextChunkLocationComparator()));
    // process parse results
    IList<IPdfTextLocation> retval = new List<IPdfTextLocation>();
    CharacterRenderInfo.StringConversionInfo txt = CharacterRenderInfo.MapString(parseResult);
    Match mat = iText.IO.Util.StringUtil.Match(pattern, txt.text);
    while (mat.Success) {
        // first chunk that starts at or after the beginning of the match
        int startOffset = mat.Index;
        while (startOffset < txt.text.Length && !txt.indexMap.ContainsKey(startOffset)) {
            startOffset++;
        }
        // last chunk that starts at or before the final character of the match
        int endOffset = mat.Index + mat.Length - 1;
        while (endOffset >= 0 && !txt.indexMap.ContainsKey(endOffset)) {
            endOffset--;
        }
        int? startIndex = txt.indexMap.Get(startOffset);
        int? endIndex = txt.indexMap.Get(endOffset);
        // An unresolved offset, or start > end (zero-length match, or a match consisting only of
        // inserted spaces), means the match covers no parsed character: nothing to report.
        if (startIndex != null && endIndex != null && startIndex.Value <= endIndex.Value) {
            // SubList takes an exclusive upper bound, hence the + 1
            foreach (Rectangle r in ToRectangles(parseResult.SubList(startIndex.Value, endIndex.Value + 1))) {
                retval.Add(new DefaultPdfTextLocation(0, r, iText.IO.Util.StringUtil.Group(mat, 0)));
            }
        }
        mat = mat.NextMatch();
    }
    /* sort
     * even though the return type is an unordered ICollection<IPdfTextLocation>, we apply a
     * sorting algorithm here. This is to ensure that tests that use this functionality (for
     * instance to generate pdf with areas of interest highlighted) will not break when compared.
     */
    JavaCollectionsUtil.Sort(retval, new _IComparer_54());
    return retval;
}
/// <summary>
/// Converts a <c>List&lt;CharacterRenderInfo&gt;</c> into a single string, together with a
/// mapping from positions in that string back to indices in the list.
/// </summary>
/// <remarks>
/// The returned structure contains both the plain text and the index map. Each key of the map
/// is the offset in the text at which a list element's text starts; the value is the index of
/// that element in <paramref name="cris"/>.
/// <para />
/// String offsets and list indices can differ: when two consecutive elements on the same line
/// are at a word boundary and neither the end of the previous text nor the start of the current
/// text is already a space, a single space is inserted between them. Every inserted space
/// shifts the subsequent offsets by one, and the offset of the inserted space itself has no
/// entry in the map.
/// </remarks>
/// <param name="cris">the characters to convert, expected to be in reading order</param>
/// <returns>the assembled text and the offset-to-index map</returns>
internal static CharacterRenderInfo.StringConversionInfo MapString(IList<iText.Kernel.Pdf.Canvas.Parser.Listener.CharacterRenderInfo> cris) {
    IDictionary<int, int?> indexMap = new Dictionary<int, int?>();
    StringBuilder sb = new StringBuilder();
    iText.Kernel.Pdf.Canvas.Parser.Listener.CharacterRenderInfo lastChunk = null;
    for (int i = 0; i < cris.Count; i++) {
        iText.Kernel.Pdf.Canvas.Parser.Listener.CharacterRenderInfo chunk = cris[i];
        // we only insert a blank space if the trailing character of the previous string wasn't a space,
        // and the leading character of the current string isn't a space
        if (lastChunk != null
            && chunk.SameLine(lastChunk)
            && chunk.GetLocation().IsAtWordBoundary(lastChunk.GetLocation())
            && !chunk.GetText().StartsWith(" ", System.StringComparison.Ordinal)
            && !lastChunk.GetText().EndsWith(" ", System.StringComparison.Ordinal)) {
            sb.Append(' ');
        }
        // record where this chunk's text begins (after any inserted space), then append it
        indexMap.Put(sb.Length, i);
        sb.Append(chunk.GetText());
        lastChunk = chunk;
    }
    CharacterRenderInfo.StringConversionInfo ret = new CharacterRenderInfo.StringConversionInfo();
    ret.indexMap = indexMap;
    ret.text = sb.ToString();
    return ret;
}