示例#1
0
        /// <summary>
        /// Extracts the text of only those chunks whose start-to-end segment intersects <paramref name="rect"/>.
        /// </summary>
        /// <remarks>
        /// The chunk list held by the strategy (obtained through <c>locationalResultField</c>) is narrowed in place
        /// for the duration of the parameterless <c>GetResultantText()</c> call, then restored to exactly the
        /// content and order it had on entry, even if narrowing or extraction throws.
        /// </remarks>
        /// <param name="strategy">The strategy whose collected chunks are filtered.</param>
        /// <param name="rect">The area a chunk's segment must intersect to be included.</param>
        /// <returns>The text produced by the strategy from the intersecting chunks only.</returns>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            IList<TextChunk> locationalResult = (IList<TextChunk>)locationalResultField.GetValue(strategy);

            // Snapshot the list so it can be restored exactly (content and order) afterwards.
            List<TextChunk> original = new List<TextChunk>(locationalResult);
            List<TextChunk> matching = new List<TextChunk>(original.Count);

            foreach (TextChunk chunk in original)
            {
                ITextChunkLocation location = chunk.GetLocation();
                Vector start = location.GetStartLocation();
                Vector end = location.GetEndLocation();
                if (rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2)))
                {
                    matching.Add(chunk);
                }
            }

            try
            {
                // Rebuild the live list in one pass instead of removing rejected chunks one by one,
                // which would cost a linear search per removal.
                locationalResult.Clear();
                foreach (TextChunk chunk in matching)
                {
                    locationalResult.Add(chunk);
                }
                return strategy.GetResultantText();
            }
            finally
            {
                // Always put the strategy back into its original state. Re-adding only the rejected
                // chunks would append them at the tail and change the list order.
                locationalResult.Clear();
                foreach (TextChunk chunk in original)
                {
                    locationalResult.Add(chunk);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Extension overload of <c>GetResultantText</c> that limits extraction to the chunks whose
        /// start-to-end segment intersects <paramref name="rect"/>.
        /// </summary>
        /// <remarks>
        /// The strategy's chunk list (read through <c>locationalResultField</c>) is temporarily reduced to the
        /// intersecting chunks while the parameterless <c>GetResultantText()</c> runs. The <c>finally</c> block
        /// then always restores the list to the content and order it had on entry, whether the call
        /// returned normally or threw.
        /// </remarks>
        /// <param name="strategy">The strategy whose collected chunks are filtered.</param>
        /// <param name="rect">The area a chunk's segment must intersect to be included.</param>
        /// <returns>The text produced by the strategy from the intersecting chunks only.</returns>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            // Live chunk list owned by the extraction strategy.
            IList<TextChunk> locationalResult = (IList<TextChunk>)locationalResultField.GetValue(strategy);

            // Copy of the list as it is right now; used to restore the strategy afterwards.
            List<TextChunk> snapshot = new List<TextChunk>(locationalResult);

            // Chunks that intersect the requested rectangle, in their original relative order.
            List<TextChunk> insideRect = new List<TextChunk>(snapshot.Count);
            foreach (TextChunk chunk in snapshot)
            {
                ITextChunkLocation location = chunk.GetLocation();
                Vector start = location.GetStartLocation();
                Vector end = location.GetEndLocation();

                bool intersects = rect.IntersectsLine(
                    start.Get(Vector.I1), start.Get(Vector.I2),
                    end.Get(Vector.I1), end.Get(Vector.I2));
                if (intersects)
                {
                    insideRect.Add(chunk);
                }
            }

            try
            {
                // Narrow the live list in a single pass; per-item Remove would do a linear search for each
                // rejected chunk.
                locationalResult.Clear();
                foreach (TextChunk chunk in insideRect)
                {
                    locationalResult.Add(chunk);
                }

                return strategy.GetResultantText();
            }
            finally
            {
                // Runs on both success and failure: restore the exact original list rather than appending the
                // rejected chunks to the end, so the strategy is left as it was found.
                locationalResult.Clear();
                foreach (TextChunk chunk in snapshot)
                {
                    locationalResult.Add(chunk);
                }
            }
        }
示例#3
0
        /// <summary>
        /// Decides whether <paramref name="chunk"/> starts a new word relative to <paramref name="previousChunk"/>.
        /// </summary>
        /// <param name="chunk">The chunk being examined.</param>
        /// <param name="previousChunk">The chunk that precedes it.</param>
        /// <returns>
        /// <c>false</c> if either chunk has identical start and end locations; otherwise <c>true</c> when the
        /// measured gap exceeds the average of the two chunks' character-space widths.
        /// </returns>
        protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
        {
            ITextChunkLocation current = chunk.GetLocation();
            ITextChunkLocation previous = previousChunk.GetLocation();

            // A chunk whose start equals its end never triggers a word boundary.
            if (current.GetStartLocation().Equals(current.GetEndLocation()))
            {
                return false;
            }
            if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
            {
                return false;
            }

            // NOTE(review): the gap is measured from the current chunk's parallel END to the previous chunk's
            // parallel START - confirm this pairing is intended.
            var gap = current.DistParallelEnd() - previous.DistParallelStart();
            var threshold = (current.GetCharSpaceWidth() + previous.GetCharSpaceWidth()) / 2.0f;
            return gap > threshold;
        }
    /// <summary>
    /// Decides whether this location and <paramref name="as"/> are treated as belonging to the same line.
    /// </summary>
    /// <param name="as">The location to compare with.</param>
    /// <returns>
    /// <c>false</c> when the orientation magnitudes differ. <c>true</c> when the perpendicular distances differ
    /// by less than 2. Otherwise <c>true</c> only when that difference does not exceed
    /// <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c> and at least one of the two segments has zero length.
    /// </returns>
    public virtual bool SameLine(ITextChunkLocation @as)
    {
        if (OrientationMagnitude() != @as.OrientationMagnitude())
        {
            return false;
        }

        int perpendicularGap = Math.Abs(DistPerpendicular() - @as.DistPerpendicular());
        if (perpendicularGap < 2)
        {
            return true;
        }

        LineSegment ownSegment = new LineSegment(startLocation, endLocation);
        LineSegment theirSegment = new LineSegment(@as.GetStartLocation(), @as.GetEndLocation());

        // Larger deviations are tolerated only up to the diacritical-mark limit...
        bool withinDeviation = perpendicularGap <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
        if (!withinDeviation)
        {
            return false;
        }

        // ...and only when one of the two chunks is a zero-length segment.
        return ownSegment.GetLength() == 0 || theirSegment.GetLength() == 0;
    }
    /// <summary>
    /// Decides whether a word boundary lies between <paramref name="previous"/> and this location.
    /// </summary>
    /// <param name="previous">The location of the preceding chunk.</param>
    /// <returns>
    /// <c>false</c> if either location has identical start and end points, or if the distance is negative when
    /// measured in both directions; otherwise <c>true</c> when the distance exceeds half of
    /// <c>GetCharSpaceWidth()</c>.
    /// </returns>
    public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
    {
        if (startLocation.Equals(endLocation))
        {
            return false;
        }
        if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
        {
            return false;
        }

        float gap = DistanceFromEndOf(previous);
        if (gap < 0)
        {
            // Measure the other way round; a negative value in both directions means the chunks
            // intersect, and no space is needed then.
            float reverseGap = previous.DistanceFromEndOf(this);
            if (reverseGap < 0)
            {
                return false;
            }
            gap = reverseGap;
        }

        return gap > GetCharSpaceWidth() / 2.0f;
    }
        /// <summary>
        /// Tells whether a space should separate the chunk at <paramref name="previous"/> from the chunk at this
        /// location.
        /// </summary>
        /// <param name="previous">The location of the preceding chunk.</param>
        /// <returns>
        /// <c>true</c> when the distance between the two chunks is greater than half of
        /// <c>GetCharSpaceWidth()</c>; <c>false</c> for zero-length chunks and for intersecting chunks.
        /// </returns>
        public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
        {
            // A zero-length chunk is probably a mark character, and no space is wanted next to one.
            bool involvesZeroLengthChunk = startLocation.Equals(endLocation)
                                           || previous.GetEndLocation().Equals(previous.GetStartLocation());
            if (involvesZeroLengthChunk)
            {
                return false;
            }

            float distance = DistanceFromEndOf(previous);
            if (distance < 0)
            {
                distance = previous.DistanceFromEndOf(this);
            }
            else
            {
                return distance > GetCharSpaceWidth() / 2.0f;
            }

            // Still negative after measuring in the opposite direction: the chunks intersect.
            if (distance < 0)
            {
                return false;
            }
            return distance > GetCharSpaceWidth() / 2.0f;
        }
        /// <summary>
        /// Decides whether a word boundary lies between <paramref name="previous"/> and this location.
        /// </summary>
        /// <remarks>
        /// A very specific PDF case is handled first, for example:
        /// <c>-.232 Tc [( P)-226.2(r)-231.8(e)-230.8(f)-238(a)-238.9(c)-228.9(e)]TJ</c>.
        /// There the font's charSpace width of 0.232 is cancelled out by a charSpacing of 0.232, so the resulting
        /// charSpaceWidth reaches the TextChunk constructor as 0. Without the check every chunk would count as a
        /// word boundary and receive a space. A charSpaceWidth equal (or close) to zero is therefore treated as
        /// "no space".
        /// </remarks>
        /// <param name="previous">The location of the preceding chunk.</param>
        /// <returns>
        /// <c>false</c> when <c>GetCharSpaceWidth()</c> is below 0.1 or either chunk has zero length; otherwise
        /// <c>true</c> when the distance from the end of <paramref name="previous"/> is less than the negated
        /// char-space width or greater than half of it.
        /// </returns>
        public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
        {
            if (GetCharSpaceWidth() < 0.1f)
            {
                return false;
            }

            // A zero-length chunk is probably a mark character; no space is inserted around it.
            if (startLocation.Equals(endLocation))
            {
                return false;
            }
            if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
            {
                return false;
            }

            float gap = DistanceFromEndOf(previous);
            if (gap < -GetCharSpaceWidth())
            {
                return true;
            }
            return gap > GetCharSpaceWidth() / 2.0f;
        }
 /// <summary>
 /// Checks whether <paramref name="markLocation"/> lies within the extent of <paramref name="baseLocation"/>.
 /// </summary>
 /// <param name="baseLocation">Location of the candidate base chunk.</param>
 /// <param name="markLocation">Location of the mark chunk.</param>
 /// <returns>
 /// <c>true</c> when all three conditions hold. First, the base start's <c>Vector.I1</c> component is not greater
 /// than the mark start's. Second, the base end's <c>Vector.I1</c> component is not smaller than the mark end's.
 /// Third, the perpendicular distances differ by at most <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c>.
 /// </returns>
 internal static bool ContainsMark(ITextChunkLocation baseLocation, ITextChunkLocation markLocation)
 {
     // The guards are written as !(a <= b) so that the outcome matches a plain conjunction for every value.
     var baseStart = baseLocation.GetStartLocation().Get(Vector.I1);
     var markStart = markLocation.GetStartLocation().Get(Vector.I1);
     if (!(baseStart <= markStart))
     {
         return false;
     }

     var baseEnd = baseLocation.GetEndLocation().Get(Vector.I1);
     var markEnd = markLocation.GetEndLocation().Get(Vector.I1);
     if (!(baseEnd >= markEnd))
     {
         return false;
     }

     var perpendicularGap = Math.Abs(baseLocation.DistPerpendicular() - markLocation.DistPerpendicular());
     return perpendicularGap <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
 }
示例#9
0
 /// <summary>
 /// Writes four diagnostic lines about this chunk to standard output: its text with start and end locations,
 /// its orientation magnitude, its perpendicular distance, and its parallel start distance (printed under the
 /// label <c>distParallel</c>).
 /// </summary>
 internal virtual void PrintDiagnostics()
 {
     var output = System.Console.Out;

     string header = "Text (@" + location.GetStartLocation() + " -> " + location.GetEndLocation() + "): " + text;
     output.WriteLine(header);

     string orientationLine = "orientationMagnitude: " + location.OrientationMagnitude();
     output.WriteLine(orientationLine);

     string perpendicularLine = "distPerpendicular: " + location.DistPerpendicular();
     output.WriteLine(perpendicularLine);

     string parallelLine = "distParallel: " + location.DistParallelStart();
     output.WriteLine(parallelLine);
 }
示例#10
0
        /// <summary>
        /// Sorts <paramref name="textChunks"/> in place while keeping zero-length "mark" chunks next to the chunk
        /// they are attached to.
        /// </summary>
        /// <remarks>
        /// A chunk whose start and end locations are equal is attached to the first non-zero-length chunk for
        /// which <c>TextChunkLocationDefaultImp.ContainsMark</c> is true. It is recorded as preceding or succeeding
        /// according to whether it came before or after that chunk in the incoming list. All other chunks,
        /// including marks with nothing to attach to, are sorted with the location comparator. The list is then
        /// rebuilt with each chunk's marks emitted around it; for right-to-left runs the mark groups are swapped
        /// and reversed.
        /// </remarks>
        /// <param name="textChunks">The list to reorder; it is cleared and refilled.</param>
        private void SortWithMarks(IList<TextChunk> textChunks)
        {
            IDictionary<TextChunk, LocationTextExtractionStrategy.TextChunkMarks> marksByBase =
                new Dictionary<TextChunk, LocationTextExtractionStrategy.TextChunkMarks>();
            IList<TextChunk> sortable = new List<TextChunk>();

            // Pass 1: split the chunks into attached marks and sortable chunks.
            for (int candidateIndex = 0; candidateIndex < textChunks.Count; candidateIndex++)
            {
                TextChunk candidate = textChunks[candidateIndex];
                ITextChunkLocation candidateLocation = candidate.GetLocation();
                bool attached = false;

                if (candidateLocation.GetStartLocation().Equals(candidateLocation.GetEndLocation()))
                {
                    for (int hostIndex = 0; hostIndex < textChunks.Count; hostIndex++)
                    {
                        if (hostIndex == candidateIndex)
                        {
                            continue;
                        }

                        TextChunk host = textChunks[hostIndex];
                        ITextChunkLocation hostLocation = host.GetLocation();

                        // Only a non-zero-length chunk that contains the mark can act as its base.
                        if (hostLocation.GetStartLocation().Equals(hostLocation.GetEndLocation()))
                        {
                            continue;
                        }
                        if (!TextChunkLocationDefaultImp.ContainsMark(hostLocation, candidateLocation))
                        {
                            continue;
                        }

                        LocationTextExtractionStrategy.TextChunkMarks hostMarks = marksByBase.Get(host);
                        if (hostMarks == null)
                        {
                            hostMarks = new LocationTextExtractionStrategy.TextChunkMarks();
                            marksByBase.Put(host, hostMarks);
                        }

                        if (candidateIndex < hostIndex)
                        {
                            hostMarks.preceding.Add(candidate);
                        }
                        else
                        {
                            hostMarks.succeeding.Add(candidate);
                        }

                        attached = true;
                        break;
                    }
                }

                if (!attached)
                {
                    sortable.Add(candidate);
                }
            }

            // Pass 2: order the sortable chunks.
            JavaCollectionsUtil.Sort(
                sortable,
                new TextChunkLocationBasedComparator(new DefaultTextChunkLocationComparator(!rightToLeftRunDirection)));

            // Pass 3: rebuild the caller's list, emitting each chunk's marks around it.
            textChunks.Clear();
            foreach (TextChunk chunk in sortable)
            {
                LocationTextExtractionStrategy.TextChunkMarks chunkMarks = marksByBase.Get(chunk);
                if (chunkMarks == null)
                {
                    textChunks.Add(chunk);
                    continue;
                }

                if (rightToLeftRunDirection)
                {
                    // Right-to-left: succeeding marks come first, each group in reverse order.
                    for (int j = chunkMarks.succeeding.Count - 1; j >= 0; j--)
                    {
                        textChunks.Add(chunkMarks.succeeding[j]);
                    }
                    textChunks.Add(chunk);
                    for (int j = chunkMarks.preceding.Count - 1; j >= 0; j--)
                    {
                        textChunks.Add(chunkMarks.preceding[j]);
                    }
                }
                else
                {
                    foreach (TextChunk mark in chunkMarks.preceding)
                    {
                        textChunks.Add(mark);
                    }
                    textChunks.Add(chunk);
                    foreach (TextChunk mark in chunkMarks.succeeding)
                    {
                        textChunks.Add(mark);
                    }
                }
            }
        }