/// <summary>
/// Appends <paramref name="content"/> to <c>Html</c> and records an <c>HtmlTagSpan</c>
/// spanning the appended text in <c>HtmlTags</c>.
/// </summary>
/// <param name="content">Text to append.</param>
public void Append(string content)
{
    // Anything appended after the first piece of content is preceded by a separator
    bool hasExistingContent = Html.Length > 0;
    if (hasExistingContent)
    {
        Html.Append("\r\n[...] ");
    }

    // The tag starts where the content will land (after any separator) and ends on its last character
    int firstIdx = Html.Length;
    int lastIdx = firstIdx + content.Length - 1;
    HtmlTags.Add(new HtmlTagSpan(new Span(firstIdx, lastIdx)));

    Html.Append(content);
}
/// <summary>
/// Appends the text of <paramref name="span"/> on page <paramref name="pageIdx"/> to
/// <paramref name="str"/>, and adds to <c>HtmlTags</c> one styled tag per text object that
/// overlaps the span, plus one tag per extract overlapping that text object.
/// </summary>
/// <param name="pageIdx">Index into <c>Document.Pages</c> of the page to read.</param>
/// <param name="span">Character range of the page text to append.</param>
/// <param name="str">Destination buffer; tag positions are offset by its length on entry.</param>
private void Append(int pageIdx, Span span, StringBuilder str)
{
    PdfPage page = Document.Pages[pageIdx];
    PdfText pdfText = page.Text;

    // Get all text objects -- The start index of each is resolved from the position of its first character
    TextObject GetTextObject(PdfTextObject textObj)
    {
        var bbox = textObj.GetCharRect(0);
        // NOTE(review): PDFium-style position lookups return a negative index when no character is
        // found near the point -- confirm whether such objects need to be filtered out here
        var absIdx = pdfText.GetCharIndexAtPos(bbox.left, bbox.top, 1, 1);
        return new TextObject(textObj, absIdx);
    }

    var textObjects = page.PageObjects
        .Where(o => o.ObjectType == PageObjectTypes.PDFPAGE_TEXT)
        .Select(o => GetTextObject((PdfTextObject)o))
        .OrderBy(t => t.StartIndex)
        .ToList();

    // Some PDF documents are improperly formatted and miss PdfTextObjects -- Fill the gaps
    int lastEndIdx = textObjects.FirstOrDefault()?.EndIndex ?? 0;
    for (int i = 1; i < textObjects.Count; i++)
    {
        var textObj = textObjects[i];

        // Objects should start right after the covered range; an earlier start (overlap) is tolerated
        if (textObj.StartIndex > lastEndIdx + 1)
        {
            // Synthesize an object for the uncovered range, based on the preceding object
            var gapTextObj = new TextObject(textObjects[i - 1], lastEndIdx + 1, textObj.StartIndex - lastEndIdx - 1);
            textObjects.Insert(i, gapTextObj);
            i++; // textObj moved one slot to the right; step over the inserted gap
        }

        // Keep the covered range monotonic: an overlapping object that ends before the range already
        // covered must not pull it back, or the next object would get a bogus gap over covered text
        if (textObj.EndIndex > lastEndIdx)
        {
            lastEndIdx = textObj.EndIndex;
        }
    }

    // Build the HTML tags
    int shift = str.Length;
    foreach (var textObj in textObjects)
    {
        // Check overlap
        var objSpan = new Span(textObj.StartIndex, textObj.StartIndex + textObj.Length - 1);
        if (objSpan.Overlaps(span, out var overlap) == false)
        {
            continue;
        }

        // Look behind for line return, and extend span for inclusion -- Unlike PdfTextObjects, GetText includes \r\n
        int lookbackIdx = textObj.StartIndex - 2;
        int tagStartIdxExtendBehind = 0;
        if (lookbackIdx >= span.StartIdx && pdfText.GetText(lookbackIdx, 2) is "\r\n")
        {
            tagStartIdxExtendBehind = -2;
        }

        // Generate text object tag -- Positions are relative to the destination buffer
        var relStartIdx = shift + overlap.StartIdx - span.StartIdx;
        var tag = new HtmlTagSpan(new Span(relStartIdx + tagStartIdxExtendBehind, relStartIdx + overlap.Length - 1))
            .WithStyle(s => SetTextStyle(s, textObj));
        HtmlTags.Add(tag);

        // Generate extract tag
        if (OverlapsWithExtract(pageIdx, overlap, out var extractOverlaps))
        {
            foreach (var extractOverlap in extractOverlaps)
            {
                int extractStartIdx = shift + extractOverlap.StartIdx - span.StartIdx;
                var extractSpan = new Span(extractStartIdx, extractStartIdx + extractOverlap.Length - 1);

                // Use the instance returned by WithStyle, as done for the text object tag above,
                // so the style is kept whether or not WithStyle mutates in place
                var extractTag = new HtmlTagSpan(extractSpan, 100)
                    .WithStyle(s => s.WithBackgroundColorColor(extractOverlap.Object));
                HtmlTags.Add(extractTag);
            }
        }
    }

    str.Append(pdfText.GetText(span.StartIdx, span.Length));
}