/// <summary>
/// Overload of <c>GetResultantText</c> that extracts only the text of chunks whose
/// start-to-end line intersects <paramref name="rect"/>.
/// </summary>
/// <remarks>
/// The strategy's chunk list (read through <c>locationalResultField</c>) is temporarily
/// narrowed to the intersecting chunks, the parameterless <c>GetResultantText()</c> is
/// invoked, and the list is then restored to its original content and order.
/// </remarks>
/// <param name="strategy">Extraction strategy whose collected chunks are filtered.</param>
/// <param name="rect">Area a chunk must intersect to be included in the result.</param>
/// <returns>The text produced by the strategy from the intersecting chunks only.</returns>
public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect) {
    // Get the chunks of text collected by the extraction strategy.
    IList<TextChunk> locationalResult = (IList<TextChunk>)locationalResultField.GetValue(strategy);

    // Snapshot the full list so it can be restored exactly (same elements, same order).
    List<TextChunk> allChunks = new List<TextChunk>(locationalResult);

    // Collect the chunks whose start-to-end line intersects the requested rectangle.
    // The live list is not modified here, so a failure while filtering leaves it intact.
    List<TextChunk> matching = new List<TextChunk>();
    foreach (TextChunk chunk in allChunks) {
        ITextChunkLocation location = chunk.GetLocation();
        Vector start = location.GetStartLocation();
        Vector end = location.GetEndLocation();
        if (rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2))) {
            matching.Add(chunk);
        }
    }

    try {
        // Swap in the filtered content in one pass instead of removing chunks one by one.
        locationalResult.Clear();
        foreach (TextChunk chunk in matching) {
            locationalResult.Add(chunk);
        }
        return strategy.GetResultantText();
    }
    finally {
        // Always runs, whether the call above returned or threw: put every chunk back
        // in its original position so this call has no lasting effect on the strategy.
        locationalResult.Clear();
        foreach (TextChunk chunk in allChunks) {
            locationalResult.Add(chunk);
        }
    }
}
/// <summary>
/// Returns the strategy's resultant text restricted to the chunks whose start-to-end
/// line intersects <paramref name="rect"/>.
/// </summary>
/// <remarks>
/// Works by temporarily replacing the content of the strategy's chunk list (obtained via
/// <c>locationalResultField</c>) with the intersecting chunks, calling the parameterless
/// <c>GetResultantText()</c>, and then restoring the original content in its original order.
/// </remarks>
/// <param name="strategy">Extraction strategy holding the collected text chunks.</param>
/// <param name="rect">Rectangle used to select chunks.</param>
/// <returns>Text assembled by the strategy from the selected chunks.</returns>
public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect) {
    IList<TextChunk> locationalResult = (IList<TextChunk>)locationalResultField.GetValue(strategy);

    // Keep an untouched copy: restoring from it preserves the original ordering, which
    // remove-then-append would not.
    List<TextChunk> snapshot = new List<TextChunk>(locationalResult);

    // Select first, mutate later, so an exception during selection cannot leave the
    // strategy with a partially filtered list.
    List<TextChunk> selected = new List<TextChunk>();
    foreach (TextChunk chunk in snapshot) {
        ITextChunkLocation location = chunk.GetLocation();
        Vector start = location.GetStartLocation();
        Vector end = location.GetEndLocation();
        bool intersects = rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2));
        if (intersects) {
            selected.Add(chunk);
        }
    }

    try {
        locationalResult.Clear();
        foreach (TextChunk chunk in selected) {
            locationalResult.Add(chunk);
        }
        return strategy.GetResultantText();
    }
    finally {
        // Executed on both normal return and exception.
        locationalResult.Clear();
        foreach (TextChunk chunk in snapshot) {
            locationalResult.Add(chunk);
        }
    }
}
/// <summary>
/// Writes this chunk's text and its location metrics to the console, one metric per line.
/// Intended for debugging.
/// </summary>
internal virtual void PrintDiagnostics() {
    System.IO.TextWriter console = System.Console.Out;

    // Each message is built immediately before it is written, so the location getters
    // are invoked in the same order relative to the output as before.
    string textLine = "Text (@" + location.GetStartLocation() + " -> " + location.GetEndLocation() + "): " + text;
    console.WriteLine(textLine);

    string orientationLine = "orientationMagnitude: " + location.OrientationMagnitude();
    console.WriteLine(orientationLine);

    string perpendicularLine = "distPerpendicular: " + location.DistPerpendicular();
    console.WriteLine(perpendicularLine);

    string parallelLine = "distParallel: " + location.DistParallelStart();
    console.WriteLine(parallelLine);
}
/// <summary>
/// Decides whether a word boundary lies between <paramref name="previousChunk"/> and
/// <paramref name="chunk"/>.
/// </summary>
/// <param name="chunk">The chunk being examined.</param>
/// <param name="previousChunk">The chunk that precedes it.</param>
/// <returns>
/// <c>false</c> if either chunk has identical start and end locations; otherwise
/// <c>true</c> when the current chunk's parallel end distance minus the previous chunk's
/// parallel start distance exceeds the average of the two chunks' char-space widths.
/// </returns>
protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk) {
    ITextChunkLocation current = chunk.GetLocation();
    ITextChunkLocation previous = previousChunk.GetLocation();

    // A zero-length chunk on either side never produces a boundary.
    if (current.GetStartLocation().Equals(current.GetEndLocation())) {
        return false;
    }
    if (previous.GetEndLocation().Equals(previous.GetStartLocation())) {
        return false;
    }

    float gap = current.DistParallelEnd() - previous.DistParallelStart();
    float threshold = (current.GetCharSpaceWidth() + previous.GetCharSpaceWidth()) / 2.0f;
    return gap > threshold;
}
/// <summary>
/// Tells whether this location and <paramref name="as"/> are considered to be on the same line.
/// </summary>
/// <param name="as">The location to compare with.</param>
/// <returns>
/// <c>false</c> when the orientation magnitudes differ; <c>true</c> when the perpendicular
/// distances differ by less than 2; otherwise <c>true</c> only if the difference is within
/// <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c> and at least one of the two
/// segments has zero length.
/// </returns>
public virtual bool SameLine(ITextChunkLocation @as) {
    if (OrientationMagnitude() != @as.OrientationMagnitude()) {
        return false;
    }

    int perpendicularGap = Math.Abs(DistPerpendicular() - @as.DistPerpendicular());
    if (perpendicularGap < 2) {
        return true;
    }

    LineSegment ownSegment = new LineSegment(startLocation, endLocation);
    LineSegment theirSegment = new LineSegment(@as.GetStartLocation(), @as.GetEndLocation());

    // Larger deviations are tolerated only when one side is a zero-length chunk.
    bool withinMarkTolerance = perpendicularGap <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
    return withinMarkTolerance && (ownSegment.GetLength() == 0 || theirSegment.GetLength() == 0);
}
/// <summary>
/// Determines whether a space should be assumed between <paramref name="previous"/> and this location.
/// </summary>
/// <param name="previous">Location of the preceding chunk.</param>
/// <returns>
/// <c>false</c> if either location is zero-length or the two chunks intersect; otherwise
/// <c>true</c> when the gap between them is greater than half of this location's char-space width.
/// </returns>
public virtual bool IsAtWordBoundary(ITextChunkLocation previous) {
    if (startLocation.Equals(endLocation)) {
        return false;
    }
    if (previous.GetEndLocation().Equals(previous.GetStartLocation())) {
        return false;
    }

    float gap = DistanceFromEndOf(previous);
    if (gap < 0) {
        // Negative in this direction: measure the other way round.
        float reverseGap = previous.DistanceFromEndOf(this);
        if (reverseGap < 0) {
            // Negative both ways means the chunks intersect, so no space is needed.
            return false;
        }
        gap = reverseGap;
    }

    float halfCharSpace = GetCharSpaceWidth() / 2.0f;
    return gap > halfCharSpace;
}
/// <summary>
/// Checks whether this location starts a new word relative to <paramref name="previous"/>.
/// </summary>
/// <param name="previous">Location of the chunk that comes before this one.</param>
/// <returns>
/// <c>true</c> when the distance between the two chunks exceeds half of this location's
/// char-space width; <c>false</c> for zero-length chunks and for intersecting chunks.
/// </returns>
public virtual bool IsAtWordBoundary(ITextChunkLocation previous) {
    // A zero-length chunk is probably a mark character; no space is wanted around it.
    bool thisIsZeroLength = startLocation.Equals(endLocation);
    if (thisIsZeroLength) {
        return false;
    }
    bool previousIsZeroLength = previous.GetEndLocation().Equals(previous.GetStartLocation());
    if (previousIsZeroLength) {
        return false;
    }

    float distance = DistanceFromEndOf(previous);
    if (!(distance < 0)) {
        return distance > GetCharSpaceWidth() / 2.0f;
    }

    // Measured the other way round; still negative means the chunks intersect,
    // in which case no space has to be added.
    distance = previous.DistanceFromEndOf(this);
    if (distance < 0) {
        return false;
    }
    return distance > GetCharSpaceWidth() / 2.0f;
}
/// <summary>
/// Tests whether <paramref name="markLocation"/> lies within the extent of
/// <paramref name="baseLocation"/> along the <c>Vector.I1</c> component and close enough
/// to it in perpendicular distance.
/// </summary>
/// <param name="baseLocation">Location of the candidate base chunk.</param>
/// <param name="markLocation">Location of the mark chunk.</param>
/// <returns>
/// <c>true</c> if the base starts at or before the mark's start, ends at or after the mark's
/// end, and their perpendicular distances differ by at most
/// <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c>.
/// </returns>
internal static bool ContainsMark(ITextChunkLocation baseLocation, ITextChunkLocation markLocation) {
    bool startsBeforeMark = baseLocation.GetStartLocation().Get(Vector.I1) <= markLocation.GetStartLocation().Get(Vector.I1);
    if (!startsBeforeMark) {
        return false;
    }

    bool endsAfterMark = baseLocation.GetEndLocation().Get(Vector.I1) >= markLocation.GetEndLocation().Get(Vector.I1);
    if (!endsAfterMark) {
        return false;
    }

    int perpendicularGap = Math.Abs(baseLocation.DistPerpendicular() - markLocation.DistPerpendicular());
    return perpendicularGap <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
}
/// <summary>
/// Sorts <paramref name="textChunks"/> in place while keeping zero-length chunks (marks)
/// next to the chunk they are attached to.
/// </summary>
/// <remarks>
/// A zero-length chunk is attached to the first non-zero-length chunk in the list for which
/// <c>TextChunkLocationDefaultImp.ContainsMark</c> holds. Attached marks are left out of the
/// sort and re-inserted around their base afterwards; marks with no base are sorted like
/// ordinary chunks.
/// </remarks>
/// <param name="textChunks">The list to reorder; it is cleared and refilled.</param>
private void SortWithMarks(IList<TextChunk> textChunks) {
    IDictionary<TextChunk, LocationTextExtractionStrategy.TextChunkMarks> marksByBase =
        new Dictionary<TextChunk, LocationTextExtractionStrategy.TextChunkMarks>();
    IList<TextChunk> sortable = new List<TextChunk>();

    // Pass 1: split the chunks into sortable ones and marks attached to a base chunk.
    for (int i = 0; i < textChunks.Count; i++) {
        TextChunk chunk = textChunks[i];
        ITextChunkLocation chunkLocation = chunk.GetLocation();

        bool attached = false;
        if (chunkLocation.GetStartLocation().Equals(chunkLocation.GetEndLocation())) {
            for (int k = 0; k < textChunks.Count; k++) {
                if (k == i) {
                    continue;
                }
                TextChunk candidate = textChunks[k];
                ITextChunkLocation candidateLocation = candidate.GetLocation();
                if (candidateLocation.GetStartLocation().Equals(candidateLocation.GetEndLocation())) {
                    // Zero-length chunks cannot act as a base.
                    continue;
                }
                if (!TextChunkLocationDefaultImp.ContainsMark(candidateLocation, chunkLocation)) {
                    continue;
                }

                LocationTextExtractionStrategy.TextChunkMarks bucket = marksByBase.Get(candidate);
                if (bucket == null) {
                    bucket = new LocationTextExtractionStrategy.TextChunkMarks();
                    marksByBase.Put(candidate, bucket);
                }
                // Record whether the mark appeared before or after its base in the input.
                if (i < k) {
                    bucket.preceding.Add(chunk);
                }
                else {
                    bucket.succeeding.Add(chunk);
                }
                attached = true;
                break;
            }
        }

        if (!attached) {
            sortable.Add(chunk);
        }
    }

    // Pass 2: sort everything that is not an attached mark.
    JavaCollectionsUtil.Sort(sortable, new TextChunkLocationBasedComparator(
        new DefaultTextChunkLocationComparator(!rightToLeftRunDirection)));

    // Pass 3: rebuild the list, emitting each base together with its marks.
    textChunks.Clear();
    foreach (TextChunk baseChunk in sortable) {
        LocationTextExtractionStrategy.TextChunkMarks attachedMarks = marksByBase.Get(baseChunk);
        if (attachedMarks == null) {
            textChunks.Add(baseChunk);
            continue;
        }

        if (!rightToLeftRunDirection) {
            // Left-to-right: preceding marks, the base, then succeeding marks, in input order.
            for (int j = 0; j < attachedMarks.preceding.Count; j++) {
                textChunks.Add(attachedMarks.preceding[j]);
            }
            textChunks.Add(baseChunk);
            for (int j = 0; j < attachedMarks.succeeding.Count; j++) {
                textChunks.Add(attachedMarks.succeeding[j]);
            }
        }
        else {
            // Right-to-left: the mirror image, with both mark lists walked backwards.
            for (int j = attachedMarks.succeeding.Count - 1; j >= 0; j--) {
                textChunks.Add(attachedMarks.succeeding[j]);
            }
            textChunks.Add(baseChunk);
            for (int j = attachedMarks.preceding.Count - 1; j >= 0; j--) {
                textChunks.Add(attachedMarks.preceding[j]);
            }
        }
    }
}
/// <summary>
/// Determines whether a space should be inserted between <paramref name="previous"/> and this location.
/// </summary>
/// <param name="previous">Location of the preceding chunk.</param>
/// <returns>
/// <c>false</c> when this location's char-space width is below 0.1 or when either location
/// is zero-length; otherwise <c>true</c> if the distance from the end of
/// <paramref name="previous"/> is less than minus the char-space width or greater than half of it.
/// </returns>
public virtual bool IsAtWordBoundary(ITextChunkLocation previous) {
    /*
     * Special case which in PDF may look like:
     *   -.232 Tc [( P)-226.2(r)-231.8(e)-230.8(f)-238(a)-238.9(c)-228.9(e)]TJ
     * The font's charSpace width is 0.232 and it is compensated with charSpacing of 0.232,
     * so the resultant TextChunk.charSpaceWidth reaches the TextChunk constructor as 0.
     * Every chunk would then be treated as a word boundary and get a space added.
     * A charSpaceWidth equal (or close) to zero is therefore treated as "no space".
     */
    bool charSpaceIsNegligible = GetCharSpaceWidth() < 0.1f;
    if (charSpaceIsNegligible) {
        return false;
    }

    // A zero-length chunk is probably a mark character; no space is wanted in that case.
    if (startLocation.Equals(endLocation)) {
        return false;
    }
    if (previous.GetEndLocation().Equals(previous.GetStartLocation())) {
        return false;
    }

    float distance = DistanceFromEndOf(previous);
    if (distance < -GetCharSpaceWidth()) {
        return true;
    }
    return distance > GetCharSpaceWidth() / 2.0f;
}