示例#1
0
 /// <summary>
 /// Gets the whitespace size to expect for the given letter.
 /// </summary>
 /// <param name="letter">The letter to measure.</param>
 /// <returns>
 /// The letter's own width when its value is a single space; otherwise the result of
 /// <c>WhitespaceSizeStatistics.GetExpectedWhitespaceSize</c> for that letter.
 /// </returns>
 private static double GetExpectedWhitespaceSize(Letter letter)
 {
     // Anything other than a literal space is handed to the statistics helper.
     if (letter.Value != " ")
     {
         return WhitespaceSizeStatistics.GetExpectedWhitespaceSize(letter);
     }

     // A literal space already carries its own size.
     return letter.Width;
 }
示例#2
0
        /// <summary>
        /// Builds a human readable string from the letters of a page, following the
        /// order in which the letters appear in the original PDF document.
        /// </summary>
        /// <param name="page">A page from the document.</param>
        /// <param name="options">
        /// Controls various aspects of the generated text. A new <see cref="Options"/>
        /// instance is used when this is <c>null</c>.
        /// </param>
        /// <returns>The text assembled from the page's letters.</returns>
        public static string GetText(Page page, Options options)
        {
            if (options is null)
            {
                options = new Options();
            }

            var text = new StringBuilder();
            var letters = page.Letters;

            Letter lastLetter = default;
            var whitespaceJustAdded = false;

            for (var index = 0; index < letters.Count; index++)
            {
                var current = letters[index];

                // Letters without a value contribute nothing to the output.
                if (string.IsNullOrEmpty(current.Value))
                {
                    continue;
                }

                // When requested, swap a replaceable whitespace character for a plain
                // space while carrying over every other property of the letter.
                if (options.ReplaceWhitespaceWithSpace && ReplaceableWhitespace.Contains(current.Value))
                {
                    current = new Letter(
                        " ",
                        current.GlyphRectangle,
                        current.StartBaseLine,
                        current.EndBaseLine,
                        current.Width,
                        current.FontSize,
                        current.Font,
                        current.Color,
                        current.PointSize,
                        current.TextSequence);
                }

                var isSpace = current.Value == " ";

                if (isSpace && !whitespaceJustAdded)
                {
                    // A space that IsNewline reports as a line break relative to the
                    // last emitted letter is dropped; any other space is emitted once.
                    if (lastLetter == null || !IsNewline(lastLetter, current, page, out _))
                    {
                        text.Append(' ');
                        lastLetter = current;
                        whitespaceJustAdded = true;
                    }

                    continue;
                }

                whitespaceJustAdded = false;

                if (lastLetter != null && !isSpace)
                {
                    // The newline test is made against the previous non-whitespace letter,
                    // not against the last emitted one.
                    var startsNewLine = IsNewline(
                        GetNonWhitespacePrevious(page, index),
                        current,
                        page,
                        out var isParagraphBreak);
                    var followsSpace = lastLetter.Value == " ";

                    if (startsNewLine)
                    {
                        if (followsSpace)
                        {
                            // Drop the trailing space so the line does not end with one.
                            text.Length -= 1;
                        }

                        text.AppendLine();
                        if (options.SeparateParagraphsWithDoubleNewline && isParagraphBreak)
                        {
                            text.AppendLine();
                        }

                        whitespaceJustAdded = true;
                    }
                    else if (!followsSpace
                        && WhitespaceSizeStatistics.IsProbablyWhitespace(
                            current.StartBaseLine.X - lastLetter.EndBaseLine.X,
                            lastLetter))
                    {
                        // The horizontal gap between the two letters is wide enough to be
                        // treated as whitespace even though no space letter was present.
                        text.Append(' ');
                        whitespaceJustAdded = true;
                    }
                }

                text.Append(current.Value);
                lastLetter = current;
            }

            return text.ToString();
        }
        /// <summary>
        /// Builds a human readable string from the letters of a page, following the
        /// order in which the letters appear in the original PDF document.
        /// </summary>
        /// <param name="page">A page from the document.</param>
        /// <param name="addDoubleNewline">Whether to include a double new-line when the text is likely to be a new paragraph.</param>
        /// <returns>The text assembled from the page's letters.</returns>
        public static string GetText(Page page, bool addDoubleNewline = false)
        {
            var text = new StringBuilder();
            var letters = page.Letters;

            Letter lastLetter = default;
            var whitespaceJustAdded = false;

            for (var index = 0; index < letters.Count; index++)
            {
                var current = letters[index];

                // Letters without a value contribute nothing to the output.
                if (string.IsNullOrEmpty(current.Value))
                {
                    continue;
                }

                var isSpace = current.Value == " ";

                if (isSpace && !whitespaceJustAdded)
                {
                    // A space that IsNewline reports as a line break relative to the
                    // last emitted letter is dropped; any other space is emitted once.
                    if (lastLetter == null || !IsNewline(lastLetter, current, page, out _))
                    {
                        text.Append(' ');
                        lastLetter = current;
                        whitespaceJustAdded = true;
                    }

                    continue;
                }

                whitespaceJustAdded = false;

                if (lastLetter != null && !isSpace)
                {
                    // The newline test is made against the previous non-whitespace letter,
                    // not against the last emitted one.
                    var startsNewLine = IsNewline(
                        GetNonWhitespacePrevious(page, index),
                        current,
                        page,
                        out var isParagraphBreak);
                    var followsSpace = lastLetter.Value == " ";

                    if (startsNewLine)
                    {
                        if (followsSpace)
                        {
                            // Drop the trailing space so the line does not end with one.
                            text.Length -= 1;
                        }

                        text.AppendLine();
                        if (addDoubleNewline && isParagraphBreak)
                        {
                            text.AppendLine();
                        }

                        whitespaceJustAdded = true;
                    }
                    else if (!followsSpace
                        && WhitespaceSizeStatistics.IsProbablyWhitespace(
                            current.StartBaseLine.X - lastLetter.EndBaseLine.X,
                            lastLetter))
                    {
                        // The horizontal gap between the two letters is wide enough to be
                        // treated as whitespace even though no space letter was present.
                        text.Append(' ');
                        whitespaceJustAdded = true;
                    }
                }

                text.Append(current.Value);
                lastLetter = current;
            }

            return text.ToString();
        }