Пример #1
0
            /// <summary>
            /// Orders chunk locations by orientation magnitude, then by perpendicular distance,
            /// then by parallel start distance.
            /// </summary>
            /// <param name="other">The location to compare this instance with; may be <c>null</c>.</param>
            /// <returns>
            /// A negative value when this location sorts before <paramref name="other"/>, zero when
            /// all three keys are equal (or both references are the same), and a positive value
            /// otherwise. Any instance sorts after <c>null</c>.
            /// </returns>
            public virtual int CompareTo(ITextChunkLocation other)
            {
                if (ReferenceEquals(this, other))
                {
                    return 0;
                }
                // Standard IComparable convention: every instance is greater than null.
                if (ReferenceEquals(other, null))
                {
                    return 1;
                }

                int rslt = CompareInts(OrientationMagnitude, other.OrientationMagnitude);
                if (rslt != 0)
                {
                    return rslt;
                }

                rslt = CompareInts(DistPerpendicular, other.DistPerpendicular);
                if (rslt != 0)
                {
                    return rslt;
                }

                // Use a real three-way comparison for the last key. Answering "greater" for both
                // (a, b) and (b, a) when the values are equal breaks the antisymmetry that sort
                // routines rely on and may make them fail with an inconsistent-comparer error.
                return DistParallelStart.CompareTo(other.DistParallelStart);
            }
Пример #2
0
        /// <summary>
        /// Overload of <c>GetResultantText</c> that returns only the text of chunks whose
        /// start-to-end segment intersects <paramref name="rect"/>.
        /// </summary>
        /// <remarks>
        /// The strategy's chunk list (read through <c>locationalResultField</c>) is temporarily
        /// narrowed to the intersecting chunks while the parameterless <c>GetResultantText()</c>
        /// runs. The filtered-out chunks are appended back afterwards, even if that call throws.
        /// </remarks>
        /// <param name="strategy">The extraction strategy whose collected chunks are filtered.</param>
        /// <param name="rect">The rectangle the chunks are tested against.</param>
        /// <returns>The text produced by the strategy from the intersecting chunks only.</returns>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            // Get the chunk list held by the extraction strategy.
            IList <TextChunk> locationalResult = (IList <TextChunk>)locationalResultField.GetValue(strategy);
            // Chunks that intersect the rectangle, in their original order.
            List <TextChunk> matching = new List <TextChunk>(locationalResult.Count);
            // Chunks that do not intersect the rectangle.
            List <TextChunk> nonMatching = new List <TextChunk>();

            // Partition every chunk in a single pass.
            foreach (TextChunk chunk in locationalResult)
            {
                ITextChunkLocation location = chunk.GetLocation();
                Vector start = location.GetStartLocation();
                Vector end   = location.GetEndLocation();
                if (rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2)))
                {
                    matching.Add(chunk);
                }
                else
                {
                    nonMatching.Add(chunk);
                }
            }

            // Rebuild the list from the matching chunks instead of calling Remove once per
            // rejected chunk: each Remove is a linear search, which made the filtering quadratic.
            if (nonMatching.Count > 0)
            {
                locationalResult.Clear();
                foreach (TextChunk chunk in matching)
                {
                    locationalResult.Add(chunk);
                }
            }

            try
            {
                return strategy.GetResultantText();
            }
            finally
            {
                // Always hand the filtered-out chunks back to the strategy.
                foreach (TextChunk chunk in nonMatching)
                {
                    locationalResult.Add(chunk);
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Extracts the text of the chunks whose start-to-end segment intersects
        /// <paramref name="rect"/>.
        /// </summary>
        /// <remarks>
        /// Chunks outside the rectangle are taken out of the strategy's chunk list (obtained via
        /// <c>locationalResultField</c>) for the duration of the parameterless
        /// <c>GetResultantText()</c> call and are appended back in the <c>finally</c> block.
        /// </remarks>
        /// <param name="strategy">The strategy holding the collected text chunks.</param>
        /// <param name="rect">The area of interest.</param>
        /// <returns>The resultant text built from the intersecting chunks.</returns>
        public static String GetResultantText(this LocationTextExtractionStrategy strategy, Rectangle rect)
        {
            IList <TextChunk> locationalResult = (IList <TextChunk>)locationalResultField.GetValue(strategy);
            List <TextChunk>  kept             = new List <TextChunk>(locationalResult.Count);
            List <TextChunk>  nonMatching      = new List <TextChunk>();

            foreach (TextChunk chunk in locationalResult)
            {
                ITextChunkLocation location = chunk.GetLocation();
                Vector             start    = location.GetStartLocation();
                Vector             end      = location.GetEndLocation();
                bool               inside   = rect.IntersectsLine(start.Get(Vector.I1), start.Get(Vector.I2), end.Get(Vector.I1), end.Get(Vector.I2));

                (inside ? kept : nonMatching).Add(chunk);
            }

            // One Clear plus re-adding the kept chunks is linear; removing the rejected chunks
            // one by one would scan the list once per removal.
            if (nonMatching.Count != 0)
            {
                locationalResult.Clear();
                for (int i = 0; i < kept.Count; i++)
                {
                    locationalResult.Add(kept[i]);
                }
            }

            try
            {
                return strategy.GetResultantText();
            }
            finally
            {
                // Restore the rejected chunks regardless of how the extraction ended.
                for (int i = 0; i < nonMatching.Count; i++)
                {
                    locationalResult.Add(nonMatching[i]);
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Decides whether a word boundary lies between <paramref name="previousChunk"/> and
        /// <paramref name="chunk"/>.
        /// </summary>
        /// <param name="chunk">The chunk being examined.</param>
        /// <param name="previousChunk">The chunk that came before it.</param>
        /// <returns>
        /// <c>false</c> when either chunk has identical start and end locations; otherwise
        /// whether the current chunk's parallel end minus the previous chunk's parallel start
        /// exceeds the average of the two char-space widths.
        /// </returns>
        protected override bool IsChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk)
        {
            ITextChunkLocation current  = chunk.GetLocation();
            ITextChunkLocation previous = previousChunk.GetLocation();

            // A chunk whose start and end coincide never produces a boundary.
            if (current.GetStartLocation().Equals(current.GetEndLocation()))
            {
                return false;
            }
            if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
            {
                return false;
            }

            float gap       = current.DistParallelEnd() - previous.DistParallelStart();
            float threshold = (current.GetCharSpaceWidth() + previous.GetCharSpaceWidth()) / 2.0f;

            return gap > threshold;
        }
Пример #5
0
            /// <summary>
            /// Tells whether a space should separate this location from <paramref name="previous"/>.
            /// </summary>
            /// <param name="previous">The location of the preceding chunk.</param>
            /// <returns>
            /// <c>false</c> when the distance is negative in both directions (the chunks
            /// intersect); otherwise whether the distance exceeds half of <c>CharSpaceWidth</c>.
            /// </returns>
            public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
            {
                float gap = DistanceFromEndOf(previous);

                // A negative gap means this chunk starts before the previous one ends,
                // so measure the other way round instead.
                if (gap < 0)
                {
                    gap = previous.DistanceFromEndOf(this);
                }

                // Still negative: the chunks intersect and no space is needed.
                if (gap < 0)
                {
                    return false;
                }

                float halfSpace = CharSpaceWidth / 2.0f;
                return gap > halfSpace;
            }
    /// <summary>
    /// Tells whether this location and <paramref name="as"/> are treated as lying on one line.
    /// </summary>
    /// <param name="as">The location to compare with.</param>
    /// <returns>
    /// <c>false</c> if the orientation magnitudes differ; <c>true</c> if the perpendicular
    /// distances differ by less than 2; otherwise <c>true</c> only when the difference is within
    /// <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c> and at least one of the two segments
    /// has zero length.
    /// </returns>
    public virtual bool SameLine(ITextChunkLocation @as)
    {
        if (OrientationMagnitude() != @as.OrientationMagnitude())
        {
            return false;
        }

        int perpendicularGap = Math.Abs(DistPerpendicular() - @as.DistPerpendicular());
        if (perpendicularGap < 2)
        {
            return true;
        }

        LineSegment ownSegment   = new LineSegment(startLocation, endLocation);
        LineSegment theirSegment = new LineSegment(@as.GetStartLocation(), @as.GetEndLocation());

        // A larger deviation is tolerated only when one of the segments is zero-length.
        if (perpendicularGap <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION)
        {
            return ownSegment.GetLength() == 0 || theirSegment.GetLength() == 0;
        }
        return false;
    }
    /// <summary>
    /// Tells whether a space should be inserted between <paramref name="previous"/> and this location.
    /// </summary>
    /// <param name="previous">The location of the preceding chunk.</param>
    /// <returns>
    /// <c>false</c> when either location has identical start and end points or when the chunks
    /// intersect; otherwise whether the gap exceeds half of the char-space width.
    /// </returns>
    public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
    {
        if (startLocation.Equals(endLocation))
        {
            return false;
        }
        if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
        {
            return false;
        }

        float forwardGap = DistanceFromEndOf(previous);
        // When this chunk starts before the previous one ends, measure in the opposite direction.
        float gap = forwardGap < 0 ? previous.DistanceFromEndOf(this) : forwardGap;

        // Negative both ways: the chunks intersect, so no space is added.
        if (gap < 0)
        {
            return false;
        }
        return gap > GetCharSpaceWidth() / 2.0f;
    }
Пример #8
0
            /// <summary>
            /// Tells whether a space should separate this location from <paramref name="previous"/>.
            /// </summary>
            /// <param name="previous">The location of the preceding chunk.</param>
            /// <returns>
            /// <c>false</c> when <c>CharSpaceWidth</c> is below 0.1; otherwise <c>true</c> when the
            /// distance from the end of <paramref name="previous"/> is smaller than
            /// <c>-CharSpaceWidth</c> or larger than half of <c>CharSpaceWidth</c>.
            /// </returns>
            public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
            {
                // Special case seen in PDF content such as:
                //   -.232 Tc [( P)-226.2(r)-231.8(e)-230.8(f)-238(a)-238.9(c)-228.9(e)]TJ
                // The font's char-space width (0.232) is cancelled out by a char spacing of -0.232,
                // so the chunk is constructed with a char-space width of 0. Without this guard
                // every chunk would count as a word boundary and receive a space. A width equal
                // (or close) to zero is therefore treated as "no space".
                if (CharSpaceWidth < 0.1f)
                {
                    return false;
                }

                float gap = DistanceFromEndOf(previous);

                bool startsWellBeforePreviousEnd = gap < -CharSpaceWidth;
                return startsWellBeforePreviousEnd || gap > CharSpaceWidth / 2.0f;
            }
        /// <summary>
        /// Tells whether a space should be inserted between <paramref name="previous"/> and this location.
        /// </summary>
        /// <param name="previous">The location of the preceding chunk.</param>
        /// <returns>
        /// <c>false</c> for zero-length chunks and for intersecting chunks; otherwise whether
        /// the gap between the chunks exceeds half of the char-space width.
        /// </returns>
        public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
        {
            // A zero-length chunk is most likely a mark character; no space is wanted around it.
            bool thisIsZeroLength = startLocation.Equals(endLocation);
            if (thisIsZeroLength || previous.GetEndLocation().Equals(previous.GetStartLocation()))
            {
                return false;
            }

            float separation = DistanceFromEndOf(previous);
            if (separation < 0)
            {
                float reversed = previous.DistanceFromEndOf(this);
                // Negative in both directions: the chunks intersect, so no space is added.
                if (reversed < 0)
                {
                    return false;
                }
                separation = reversed;
            }

            return separation > GetCharSpaceWidth() / 2.0f;
        }
        /// <summary>
        /// Tells whether a space should be inserted between <paramref name="previous"/> and this location.
        /// </summary>
        /// <param name="previous">The location of the preceding chunk.</param>
        /// <returns>
        /// <c>false</c> when the char-space width is below 0.1 or either chunk is zero-length;
        /// otherwise <c>true</c> when the distance from the end of <paramref name="previous"/> is
        /// smaller than the negated char-space width or larger than half of it.
        /// </returns>
        public virtual bool IsAtWordBoundary(ITextChunkLocation previous)
        {
            // Special case seen in PDF content such as:
            //   -.232 Tc [( P)-226.2(r)-231.8(e)-230.8(f)-238(a)-238.9(c)-228.9(e)]TJ
            // The font's char-space width (0.232) is cancelled out by a char spacing of -0.232,
            // so the chunk ends up with a char-space width of 0. Without this guard every chunk
            // would be treated as a word boundary and get a space. A width equal (or close) to
            // zero therefore means "no space".
            if (GetCharSpaceWidth() < 0.1f)
            {
                return false;
            }

            // A zero-length chunk is most likely a mark character; do not insert a space for it.
            if (startLocation.Equals(endLocation))
            {
                return false;
            }
            if (previous.GetEndLocation().Equals(previous.GetStartLocation()))
            {
                return false;
            }

            float gap = DistanceFromEndOf(previous);
            if (gap < -GetCharSpaceWidth())
            {
                return true;
            }
            return gap > GetCharSpaceWidth() / 2.0f;
        }
Пример #11
0
 /// <summary>
 /// Creates a chunk from its text and its location.
 /// </summary>
 /// <param name="str">The text stored in the chunk.</param>
 /// <param name="location">The location stored in the chunk.</param>
 public TextChunk(String str, ITextChunkLocation location)
 {
     this.location = location;
     this.text     = str;
 }
Пример #12
0
            /// <summary>
            /// Computes this location's parallel start distance minus the parallel end distance
            /// of <paramref name="other"/>.
            /// </summary>
            /// <param name="other">The location whose end is used as the reference point.</param>
            /// <returns>The signed difference; negative when this location starts before <paramref name="other"/> ends.</returns>
            public virtual float DistanceFromEndOf(ITextChunkLocation other)
            {
                return DistParallelStart - other.DistParallelEnd;
            }
Пример #13
0
 /// <summary>
 /// Tells whether this location and <paramref name="other"/> share the same orientation
 /// magnitude and the same perpendicular distance.
 /// </summary>
 /// <param name="other">The location to compare with.</param>
 /// <returns><c>true</c> when both values match; otherwise <c>false</c>.</returns>
 public virtual bool SameLine(ITextChunkLocation other)
 {
     if (OrientationMagnitude != other.OrientationMagnitude)
     {
         return false;
     }
     return DistPerpendicular == other.DistPerpendicular;
 }
 /// <summary>
 /// Checks whether <paramref name="markLocation"/> lies within the span of
 /// <paramref name="baseLocation"/> along the <c>Vector.I1</c> component and close enough
 /// to it perpendicular-wise.
 /// </summary>
 /// <param name="baseLocation">The location of the candidate base chunk.</param>
 /// <param name="markLocation">The location of the mark chunk.</param>
 /// <returns>
 /// <c>true</c> when the base start is not after the mark start, the base end is not before
 /// the mark end (both on <c>Vector.I1</c>), and the perpendicular distances differ by at
 /// most <c>DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION</c>.
 /// </returns>
 internal static bool ContainsMark(ITextChunkLocation baseLocation, ITextChunkLocation markLocation)
 {
     bool startsNoLater = baseLocation.GetStartLocation().Get(Vector.I1) <= markLocation.GetStartLocation().Get(Vector.I1);
     if (!startsNoLater)
     {
         return false;
     }

     bool endsNoEarlier = baseLocation.GetEndLocation().Get(Vector.I1) >= markLocation.GetEndLocation().Get(Vector.I1);
     if (!endsNoEarlier)
     {
         return false;
     }

     int perpendicularGap = baseLocation.DistPerpendicular() - markLocation.DistPerpendicular();
     return Math.Abs(perpendicularGap) <= DIACRITICAL_MARKS_ALLOWED_VERTICAL_DEVIATION;
 }
 /// <summary>
 /// Measures how far the beginning of this chunk lies past the end of
 /// <paramref name="other"/>, along this chunk's orientation.
 /// </summary>
 /// <remarks>
 /// The result is only meaningful for chunks that share a line and an orientation. That
 /// precondition is deliberately not verified here, for performance reasons.
 /// </remarks>
 /// <param name="other">The chunk location whose end is the reference point.</param>
 /// <returns>
 /// This chunk's parallel start distance minus the parallel end distance of
 /// <paramref name="other"/>.
 /// </returns>
 public virtual float DistanceFromEndOf(ITextChunkLocation other)
 {
     float distance = DistParallelStart() - other.DistParallelEnd();

     return distance;
 }
Пример #16
0
        /// <summary>
        /// Sorts <paramref name="textChunks"/> in place while keeping zero-length "mark" chunks
        /// next to the chunk they belong to.
        /// </summary>
        /// <remarks>
        /// A chunk whose start and end locations are equal is attached to the first other,
        /// non-zero-length chunk for which <c>ContainsMark</c> holds: as a preceding mark when it
        /// appears earlier in the list than that base chunk, as a succeeding mark otherwise.
        /// All remaining chunks are sorted, and the list is then rebuilt with every base chunk
        /// surrounded by its marks (mirrored when <c>rightToLeftRunDirection</c> is set).
        /// </remarks>
        /// <param name="textChunks">The chunk list to reorder; it is cleared and refilled.</param>
        private void SortWithMarks(IList <TextChunk> textChunks)
        {
            IDictionary <TextChunk, LocationTextExtractionStrategy.TextChunkMarks> marksByBase =
                new Dictionary <TextChunk, LocationTextExtractionStrategy.TextChunkMarks>();
            IList <TextChunk> sortable = new List <TextChunk>();

            // Pass 1: separate attachable marks from the chunks that take part in sorting.
            for (int candidateIndex = 0; candidateIndex < textChunks.Count; candidateIndex++)
            {
                TextChunk          candidate         = textChunks[candidateIndex];
                ITextChunkLocation candidateLocation = candidate.GetLocation();
                bool               attached          = false;

                if (candidateLocation.GetStartLocation().Equals(candidateLocation.GetEndLocation()))
                {
                    for (int baseIndex = 0; baseIndex < textChunks.Count; baseIndex++)
                    {
                        if (baseIndex == candidateIndex)
                        {
                            continue;
                        }

                        TextChunk          baseChunk    = textChunks[baseIndex];
                        ITextChunkLocation baseLocation = baseChunk.GetLocation();

                        // Only a non-zero-length chunk can act as a base.
                        if (baseLocation.GetStartLocation().Equals(baseLocation.GetEndLocation()))
                        {
                            continue;
                        }
                        if (!TextChunkLocationDefaultImp.ContainsMark(baseLocation, candidateLocation))
                        {
                            continue;
                        }

                        LocationTextExtractionStrategy.TextChunkMarks attachedMarks = marksByBase.Get(baseChunk);
                        if (attachedMarks == null)
                        {
                            attachedMarks = new LocationTextExtractionStrategy.TextChunkMarks();
                            marksByBase.Put(baseChunk, attachedMarks);
                        }

                        if (candidateIndex < baseIndex)
                        {
                            attachedMarks.preceding.Add(candidate);
                        }
                        else
                        {
                            attachedMarks.succeeding.Add(candidate);
                        }

                        attached = true;
                        break;
                    }
                }

                // Regular chunks and marks without a base are sorted like everything else.
                if (!attached)
                {
                    sortable.Add(candidate);
                }
            }

            JavaCollectionsUtil.Sort(sortable, new TextChunkLocationBasedComparator(
                                         new DefaultTextChunkLocationComparator(!rightToLeftRunDirection)));

            // Pass 2: rebuild the list, emitting each base chunk together with its marks.
            textChunks.Clear();
            foreach (TextChunk baseChunk in sortable)
            {
                LocationTextExtractionStrategy.TextChunkMarks attachedMarks = marksByBase.Get(baseChunk);
                if (attachedMarks == null)
                {
                    textChunks.Add(baseChunk);
                    continue;
                }

                if (rightToLeftRunDirection)
                {
                    // Mirrored order: succeeding marks reversed, the base, preceding marks reversed.
                    for (int j = attachedMarks.succeeding.Count - 1; j >= 0; j--)
                    {
                        textChunks.Add(attachedMarks.succeeding[j]);
                    }
                    textChunks.Add(baseChunk);
                    for (int j = attachedMarks.preceding.Count - 1; j >= 0; j--)
                    {
                        textChunks.Add(attachedMarks.preceding[j]);
                    }
                }
                else
                {
                    for (int j = 0; j < attachedMarks.preceding.Count; j++)
                    {
                        textChunks.Add(attachedMarks.preceding[j]);
                    }
                    textChunks.Add(baseChunk);
                    for (int j = 0; j < attachedMarks.succeeding.Count; j++)
                    {
                        textChunks.Add(attachedMarks.succeeding[j]);
                    }
                }
            }
        }
Пример #17
0
 /// <summary>
 /// Creates a chunk holding the given text and location.
 /// </summary>
 /// <param name="string">The text stored in the chunk.</param>
 /// <param name="loc">The location stored in the chunk.</param>
 public TextChunk(String @string, ITextChunkLocation loc)
 {
     text     = @string;
     location = loc;
 }