/**
 * Applies an optional filter to a list of text chunks.
 * @param textChunks the chunks to examine; they are visited in list order
 * @param filter the filter deciding which chunks are kept. If null, no filtering
 *               is done and <code>textChunks</code> itself is returned (not a copy).
 * @return a new list holding, in their original order, the chunks for which
 *         <code>filter.Accept</code> returned true; or the input list when filter is null
 * @since 5.3.3
 */
private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, ITextChunkFilter filter) {
    if (filter != null) {
        List<TextChunk> accepted = new List<TextChunk>();
        foreach (TextChunk candidate in textChunks) {
            if (!filter.Accept(candidate)) {
                continue;
            }
            accepted.Add(candidate);
        }
        return accepted;
    }

    // No filter supplied: hand back the caller's list untouched.
    return textChunks;
}
/**
 * Gets text that meets the specified filter.
 * If multiple text extractions will be performed for the same page (i.e. for different physical regions of the page),
 * filtering at this level is more efficient than filtering using {@link FilteredRenderListener} - but not nearly as powerful
 * because most of the RenderInfo state is not captured in {@link TextChunk}.
 *
 * The accepted chunks are sorted and then concatenated: chunks on the same line are joined
 * (with a single blank between them where a word boundary is detected), and a change of line
 * is rendered as '\n'.
 *
 * @param chunkFilter the filter to apply; passed straight to filterTextChunks
 * @return the text results so far, filtered using the specified filter
 */
public virtual String GetResultantText(ITextChunkFilter chunkFilter) {
    if (DUMP_STATE) {
        DumpState();
    }

    List<TextChunk> orderedChunks = filterTextChunks(locationalResult, chunkFilter);
    orderedChunks.Sort();

    StringBuilder text = new StringBuilder();
    TextChunk previous = null;
    foreach (TextChunk current in orderedChunks) {
        if (previous != null) {
            if (!current.SameLine(previous)) {
                // Different line: separate with a line feed.
                text.Append('\n');
            } else if (IsChunkAtWordBoundary(current, previous)
                       && !StartsWithSpace(current.Text)
                       && !EndsWithSpace(previous.Text)) {
                // Same line, word boundary, and neither side already supplies
                // the blank: insert exactly one.
                text.Append(' ');
            }
        }

        // The chunk's own text is always emitted, after any separator.
        text.Append(current.Text);
        previous = current;
    }

    return text.ToString();
}
/**
 * Gets text that meets the specified filter.
 * If multiple text extractions will be performed for the same page (i.e. for different physical regions of the page),
 * filtering at this level is more efficient than filtering using {@link FilteredRenderListener} - but not nearly as powerful
 * because most of the RenderInfo state is not captured in {@link TextChunk}.
 *
 * Edited to collect terms: while the text is assembled, the text and rectangle of every
 * emitted chunk are gathered into a group. A group is handed to mergeAndStoreChunk whenever
 * a blank or a line feed is inserted, and once more after the last chunk so the final group
 * is not lost. matchTopicTerms() is invoked after all groups have been handed over.
 *
 * Chunks whose text is exactly a single blank (" ") are skipped entirely.
 *
 * @param chunkFilter the filter to apply; passed straight to filterTextChunks
 * @return the text results so far, filtered using the specified filter
 */
public virtual String GetResultantText(ITextChunkFilter chunkFilter) {
    List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
    filteredTextChunks.Sort();

    // Chunks belonging to the group currently being assembled.
    List<TextWithRect> tmpList = new List<TextWithRect>();
    StringBuilder sb = new StringBuilder();
    TextChunk lastChunk = null;

    foreach (TextChunk chunk in filteredTextChunks) {
        // Stand-alone blanks contribute neither text nor a rectangle; they also
        // do not become lastChunk, so boundary checks compare against the last
        // non-blank chunk.
        if (chunk.Text.Equals(" ")) {
            continue;
        }

        if (lastChunk != null) {
            bool groupEnded;
            if (chunk.SameLine(lastChunk)) {
                // Only insert a blank if this is a word boundary and neither the
                // previous chunk's tail nor this chunk's head already is one.
                // NOTE(review): when the blank is already embedded in a chunk's text,
                // no flush happens and both words stay in one group - confirm intended.
                groupEnded = IsChunkAtWordBoundary(chunk, lastChunk)
                             && !StartsWithSpace(chunk.Text)
                             && !EndsWithSpace(lastChunk.Text);
                if (groupEnded) {
                    sb.Append(' ');
                }
            } else {
                sb.Append('\n');
                groupEnded = true;
            }

            if (groupEnded && tmpList.Count > 0) {
                mergeAndStoreChunk(tmpList);
                tmpList.Clear();
            }
        }

        sb.Append(chunk.Text);
        tmpList.Add(new TextWithRect { Rect = chunk.Rectangle, Text = chunk.Text });
        lastChunk = chunk;
    }

    // Fix: the group for the final word used to be dropped because a flush was
    // only triggered by a *following* chunk. Hand over whatever is still pending.
    if (tmpList.Count > 0) {
        mergeAndStoreChunk(tmpList);
        tmpList.Clear();
    }

    matchTopicTerms();
    return sb.ToString();
}