Esempio n. 1
0
        /**
         * Applies an optional filter to a list of text chunks.
         * @param textChunks the chunks collected so far
         * @param filter the filter to apply. When null, no filtering is done and the very same
         *               list instance that was passed in is returned (not a copy).
         * @return a new list holding, in their original order, the chunks the filter accepted;
         *         or textChunks itself when filter is null
         * @since 5.3.3
         */
        private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter)
        {
            return filter == null
                ? textChunks
                : textChunks.FindAll(candidate => filter.Accept(candidate));
        }
Esempio n. 2
0
        /**
         * Builds the text of all chunks accepted by the given filter.
         * The accepted chunks are sorted, then concatenated: chunks on the same line are joined
         * directly (with a single blank inserted at word boundaries when neither side already
         * supplies one), and a line feed is emitted whenever a chunk is not on the same line
         * as the chunk before it.
         * If multiple text extractions will be performed for the same page (i.e. for different physical regions of the page),
         * filtering at this level is more efficient than filtering using {@link FilteredRenderListener} - but not nearly as powerful
         * because most of the RenderInfo state is not captured in {@link TextChunk}
         * @param chunkFilter the filter to apply
         * @return the text results so far, filtered using the specified filter
         */
        public virtual String GetResultantText(ITextChunkFilter chunkFilter)
        {
            if (DUMP_STATE)
            {
                DumpState();
            }

            List<TextChunk> orderedChunks = filterTextChunks(locationalResult, chunkFilter);
            orderedChunks.Sort();

            StringBuilder text = new StringBuilder();
            TextChunk previous = null;

            foreach (TextChunk current in orderedChunks)
            {
                // The very first chunk gets no separator in front of it.
                if (previous != null)
                {
                    if (!current.SameLine(previous))
                    {
                        text.Append('\n');
                    }
                    else if (IsChunkAtWordBoundary(current, previous) && !StartsWithSpace(current.Text) && !EndsWithSpace(previous.Text))
                    {
                        // only insert a blank if the previous string does not end with one
                        // and the current string does not start with one
                        text.Append(' ');
                    }
                }

                text.Append(current.Text);
                previous = current;
            }

            return text.ToString();
        }
Esempio n. 3
0
        /**
         * Gets text that meets the specified filter and, while doing so, collects terms.
         * If multiple text extractions will be performed for the same page (i.e. for different physical regions of the page),
         * filtering at this level is more efficient than filtering using {@link FilteredRenderListener} - but not nearly as powerful
         * because most of the RenderInfo state is not captured in {@link TextChunk}
         *
         * Term collection: chunks whose text is exactly a single blank are skipped. Every other
         * chunk is recorded (text plus rectangle) in a pending group. The pending group is handed
         * to mergeAndStoreChunk each time a separator (blank or line feed) is written to the
         * result, and once more after the last chunk so the trailing group is not lost.
         * matchTopicTerms is invoked after all groups have been handed over.
         *
         * @param chunkFilter the filter to apply
         * @return the text results so far, filtered using the specified filter
         */
        public virtual String GetResultantText(ITextChunkFilter chunkFilter)
        {
            List <TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);

            filteredTextChunks.Sort();

            // chunks recorded since the last separator was written
            List <TextWithRect> tmpList = new List <TextWithRect>();

            StringBuilder sb        = new StringBuilder();
            TextChunk     lastChunk = null;

            foreach (TextChunk chunk in filteredTextChunks)
            {
                // stand-alone blanks are neither emitted nor remembered as the previous chunk
                if (chunk.Text.Equals(" "))
                {
                    continue;
                }

                if (lastChunk != null)
                {
                    bool separatorWritten = false;

                    if (!chunk.SameLine(lastChunk))
                    {
                        sb.Append('\n');
                        separatorWritten = true;
                    }
                    else if (IsChunkAtWordBoundary(chunk, lastChunk) && !StartsWithSpace(chunk.Text) && !EndsWithSpace(lastChunk.Text))
                    {
                        // we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
                        sb.Append(' ');
                        separatorWritten = true;
                    }

                    // a separator closes the pending group
                    if (separatorWritten && tmpList.Count > 0)
                    {
                        mergeAndStoreChunk(tmpList);
                        tmpList.Clear();
                    }
                }

                sb.Append(chunk.Text);

                var rect = chunk.Rectangle;
                tmpList.Add(new TextWithRect {
                    Rect = rect, Text = chunk.Text
                });

                lastChunk = chunk;
            }

            // The group still pending after the last chunk has no separator behind it;
            // hand it over here, otherwise it would be dropped before matching.
            if (tmpList.Count > 0)
            {
                mergeAndStoreChunk(tmpList);
                tmpList.Clear();
            }

            matchTopicTerms();
            return(sb.ToString());
        }