/// <summary> /// heuristically merge a list of TextElement into a list of TextChunk /// ported from PDFBox's PDFTextStripper.writePage, with modifications. /// Here be dragons /// </summary> /// <param name="textElements"></param> /// <param name="verticalRulings"></param> public static List <TextChunk> MergeWords(IReadOnlyList <TextElement> textElements, IReadOnlyList <Ruling> verticalRulings) { List <TextChunk> textChunks = new List <TextChunk>(); if (textElements.Count == 0) { return(textChunks); } textChunks.Add(new TextChunk(textElements[0])); TextChunk firstTC = textChunks[0]; double previousAveCharWidth = firstTC.Width; double endOfLastTextX = firstTC.Right; double maxYForLine = firstTC.Top; //.getBottom(); double maxHeightForLine = firstTC.Height; double minYTopForLine = firstTC.Bottom; //.getTop(); double lastWordSpacing = -1; double wordSpacing, deltaSpace, averageCharWidth, deltaCharWidth; double expectedStartOfNextWordX, dist; TextElement sp, prevChar; TextChunk currentChunk; bool sameLine, acrossVerticalRuling; foreach (TextElement chr in textElements.Skip(1)) { currentChunk = textChunks[textChunks.Count - 1]; prevChar = currentChunk.TextElements[currentChunk.TextElements.Count - 1]; // if same char AND overlapped, skip if (chr.GetText().Equals(prevChar.GetText()) && (prevChar.OverlapRatio(chr) > 0.5)) { continue; } // if chr is a space that overlaps with prevChar, skip if (chr.GetText().Equals(" ") && Utils.Feq(prevChar.Left, chr.Left) && Utils.Feq(prevChar.Bottom, chr.Bottom)) // getTop() getTop() { continue; } // Resets the average character width when we see a change in font // or a change in the font size if ((chr.Font != prevChar.Font) || !Utils.Feq(chr.FontSize, prevChar.FontSize)) { previousAveCharWidth = -1; } // is there any vertical ruling that goes across chr and prevChar? acrossVerticalRuling = false; foreach (Ruling r in verticalRulings) { if (VerticallyOverlapsRuling(prevChar, r) && VerticallyOverlapsRuling(chr, r) && prevChar.X < r.Position && chr.X > r.Position || (prevChar.X > r.Position && chr.X < r.Position)) { acrossVerticalRuling = true; break; } } // Estimate the expected width of the space based on the // space character with some margin. wordSpacing = chr.WidthOfSpace; deltaSpace = 0; if (double.IsNaN(wordSpacing) || wordSpacing == 0) { deltaSpace = double.MaxValue; } else if (lastWordSpacing < 0) { deltaSpace = wordSpacing * 0.5f; // 0.5 == spacing tolerance } else { deltaSpace = ((wordSpacing + lastWordSpacing) / 2.0f) * 0.5f; } // Estimate the expected width of the space based on the // average character width with some margin. This calculation does not // make a true average (average of averages) but we found that it gave the // best results after numerous experiments. Based on experiments we also found that // .3 worked well. if (previousAveCharWidth < 0) { averageCharWidth = chr.Width / chr.GetText().Length; } else { averageCharWidth = (previousAveCharWidth + (chr.Width / chr.GetText().Length)) / 2.0f; } deltaCharWidth = averageCharWidth * AVERAGE_CHAR_TOLERANCE; // Compares the values obtained by the average method and the wordSpacing method and picks // the smaller number. expectedStartOfNextWordX = -double.MaxValue; if (endOfLastTextX != -1) { expectedStartOfNextWordX = endOfLastTextX + Math.Min(deltaCharWidth, deltaSpace); } // new line? sameLine = true; if (!Utils.Overlap(chr.Top, chr.Height, maxYForLine, maxHeightForLine)) // getBottom() { endOfLastTextX = -1; expectedStartOfNextWordX = -double.MaxValue; maxYForLine = -double.MaxValue; maxHeightForLine = -1; minYTopForLine = double.MaxValue; sameLine = false; } endOfLastTextX = chr.Right; // should we add a space? if (!acrossVerticalRuling && sameLine && expectedStartOfNextWordX < chr.Left && !prevChar.GetText().EndsWith(" ")) { sp = new TextElement( new PdfRectangle(prevChar.BoundingBox.BottomLeft, new PdfPoint(expectedStartOfNextWordX, prevChar.BoundingBox.TopRight.Y)), prevChar.Font, prevChar.FontSize, " ", prevChar.WidthOfSpace, 0); currentChunk.Add(sp); } else { sp = null; } maxYForLine = Math.Max(chr.Top, maxYForLine); // getBottom() maxHeightForLine = Math.Max(maxHeightForLine, chr.Height); minYTopForLine = Math.Min(minYTopForLine, chr.Bottom); // .getTop() dist = chr.Left - (sp != null ? sp.Right : prevChar.Right); // added by BobLd // handle cases where order of character is not good, implement quicksort??? if (dist < -wordSpacing) { dist = double.MaxValue; // force create new word because testColumnRecognition() fails } // end added if (!acrossVerticalRuling && sameLine && (dist < 0 ? currentChunk.VerticallyOverlaps(chr) : dist < wordSpacing)) { currentChunk.Add(chr); } else { // create a new chunk textChunks.Add(new TextChunk(chr)); } lastWordSpacing = wordSpacing; previousAveCharWidth = sp != null ? (averageCharWidth + sp.Width) / 2.0f : averageCharWidth; } List <TextChunk> textChunksSeparatedByDirectionality = new List <TextChunk>(); // count up characters by directionality foreach (TextChunk chunk in textChunks) { // choose the dominant direction bool isLtrDominant = chunk.IsLtrDominant() != -1; // treat neutral as LTR TextChunk dirChunk = chunk.GroupByDirectionality(isLtrDominant); textChunksSeparatedByDirectionality.Add(dirChunk); } return(textChunksSeparatedByDirectionality); }
/// <summary> /// Merges this TextChunk with the other. /// <para>Also does it in place.</para> /// </summary> /// <param name="other"></param> public TextChunk Merge(TextChunk other) { base.Merge(other); return(this); }