/// <summary>
/// Gets the whitespace size to expect for the given letter.
/// </summary>
/// <param name="letter">The letter to evaluate.</param>
/// <returns>
/// The letter's own width when the letter is a single space; otherwise the value
/// returned by <c>WhitespaceSizeStatistics.GetExpectedWhitespaceSize</c>.
/// </returns>
private static double GetExpectedWhitespaceSize(Letter letter)
{
    // Anything that is not a literal space is delegated to the statistics helper.
    if (letter.Value != " ")
    {
        return WhitespaceSizeStatistics.GetExpectedWhitespaceSize(letter);
    }

    // A literal space already carries its own width.
    return letter.Width;
}
/// <summary>
/// Gets a human readable representation of the text from the page based on
/// the letter order of the original PDF document.
/// </summary>
/// <param name="page">A page from the document.</param>
/// <param name="options">
/// Control various aspects of the generated text. When <c>null</c>, a default
/// <see cref="Options"/> instance is used.
/// </param>
/// <returns>The text assembled from the page letters.</returns>
public static string GetText(Page page, Options options)
{
    options ??= new Options();

    return GetTextCore(
        page,
        options.ReplaceWhitespaceWithSpace,
        options.SeparateParagraphsWithDoubleNewline);
}

/// <summary>
/// Gets a human readable representation of the text from the page based on
/// the letter order of the original PDF document.
/// </summary>
/// <param name="page">A page from the document.</param>
/// <param name="addDoubleNewline">Whether to include a double new-line when the text is likely to be a new paragraph.</param>
/// <returns>The text assembled from the page letters.</returns>
public static string GetText(Page page, bool addDoubleNewline = false)
{
    // This overload never substitutes replaceable whitespace characters.
    return GetTextCore(page, false, addDoubleNewline);
}

/// <summary>
/// Shared implementation behind both <c>GetText</c> overloads. Walks the page letters in
/// their stored order and builds the output text, inserting spaces and line breaks
/// between letters where the helper checks call for them.
/// </summary>
/// <param name="page">The page whose letters are read.</param>
/// <param name="replaceWhitespaceWithSpace">
/// When <c>true</c>, a letter whose value is contained in <c>ReplaceableWhitespace</c>
/// is treated as a single space.
/// </param>
/// <param name="separateParagraphsWithDoubleNewline">
/// When <c>true</c>, a second line break is emitted whenever <c>IsNewline</c> reports a double newline.
/// </param>
/// <returns>The assembled text.</returns>
private static string GetTextCore(Page page, bool replaceWhitespaceWithSpace, bool separateParagraphsWithDoubleNewline)
{
    var sb = new StringBuilder();
    var letters = page.Letters;

    var previous = default(Letter);
    var hasJustAddedWhitespace = false;

    for (var i = 0; i < letters.Count; i++)
    {
        var letter = letters[i];

        // Letters without any text contribute nothing to the output.
        if (string.IsNullOrEmpty(letter.Value))
        {
            continue;
        }

        if (replaceWhitespaceWithSpace && ReplaceableWhitespace.Contains(letter.Value))
        {
            // Same letter with its value swapped for a plain space, so the
            // space handling below applies to it.
            letter = new Letter(
                " ",
                letter.GlyphRectangle,
                letter.StartBaseLine,
                letter.EndBaseLine,
                letter.Width,
                letter.FontSize,
                letter.Font,
                letter.Color,
                letter.PointSize,
                letter.TextSequence);
        }

        if (letter.Value == " " && !hasJustAddedWhitespace)
        {
            // Drop a space that IsNewline places on a new line relative to the previous letter.
            if (previous != null && IsNewline(previous, letter, page, out _))
            {
                continue;
            }

            sb.Append(" ");
            previous = letter;
            hasJustAddedWhitespace = true;
            continue;
        }

        hasJustAddedWhitespace = false;

        if (previous != null && letter.Value != " ")
        {
            // The line-break check is made against the closest preceding non-whitespace letter.
            var nwPrevious = GetNonWhitespacePrevious(page, i);

            if (IsNewline(nwPrevious, letter, page, out var isDoubleNewline))
            {
                // Remove the space appended for the previous letter so the line has no trailing space.
                if (previous.Value == " ")
                {
                    sb.Remove(sb.Length - 1, 1);
                }

                sb.AppendLine();

                if (separateParagraphsWithDoubleNewline && isDoubleNewline)
                {
                    sb.AppendLine();
                }

                // Note: the flag stays set into the next iteration, so an explicit space that
                // immediately follows this letter skips the branch above and is appended as-is.
                hasJustAddedWhitespace = true;
            }
            else if (previous.Value != " ")
            {
                // No explicit space between the two letters: insert one when the
                // horizontal gap is judged to be whitespace.
                var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;

                if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
                {
                    sb.Append(" ");
                    hasJustAddedWhitespace = true;
                }
            }
        }

        sb.Append(letter.Value);
        previous = letter;
    }

    return sb.ToString();
}