/**
 * <summary>Orders the raw text strings by position and merges them into line-level
 * text strings, which are appended to the target list.</summary>
 * <remarks>
 * <para>The source list is sorted in place through
 * <c>TextStringPositionComparer</c>; afterwards each raw text string is appended to
 * the current target text string as long as the comparer reports it on the same line.</para>
 * <para>Virtual space characters are synthesized (a) between two adjacent non-space
 * characters whose horizontal gap exceeds half the width of a space in the current
 * style, and (b) with zero width at the end of a line that is closed.</para>
 * <para>When the <c>dehyphenated</c> member is set and a line ends with '-', the hyphen
 * is removed and the same target text string keeps receiving characters until a
 * character following a space is reached, where a new target text string is started.</para>
 * </remarks>
 * <param name="rawTextStrings">Source (lower-level) text strings; reordered in place.</param>
 * <param name="textStrings">Target (higher-level) text strings; new entries are appended.</param>
 */
private void Sort(
  List<ContentScanner.TextStringWrapper> rawTextStrings,
  List<ITextString> textStrings
  )
{
  // Step 1: put the raw text strings into positional order.
  rawTextStrings.Sort(new TextStringPositionComparer<ContentScanner.TextStringWrapper>());

  // Step 2: fold the ordered raw text strings into the target text strings.
  TextString currentLine = null;
  TextStyle currentStyle = null;
  TextChar lastChar = null;
  bool joiningHyphenatedWord = false;
  foreach (ContentScanner.TextStringWrapper rawTextString in rawTextStrings)
  {
    // A line break occurs when the current (non-empty) target text string and the
    // incoming raw text string are not on the same line.
    bool lineBreak = currentLine != null
      && currentLine.TextChars.Count > 0
      && !TextStringPositionComparer<ITextString>.IsOnTheSameLine(
        currentLine.Box.Value,
        rawTextString.Box.Value
        );
    if (lineBreak)
    {
      joiningHyphenatedWord = dehyphenated && lastChar.Value == '-';
      if (joiningHyphenatedWord)
      {
        // Hyphened word: drop the trailing hyphen and keep filling the same text string.
        currentLine.TextChars.Remove(lastChar);
      }
      else
      {
        // Full word: close the line with a zero-width virtual space, styled after the
        // previously processed raw text string, then force a new text string.
        currentLine.TextChars.Add(
          new TextChar(
            ' ',
            new RectangleF(
              lastChar.Box.Right,
              lastChar.Box.Top,
              0,
              lastChar.Box.Height
              ),
            currentStyle,
            true
            )
          );
        currentLine = null;
      }
      lastChar = null;
    }

    if (currentLine == null)
    {
      currentLine = new TextString();
      textStrings.Add(currentLine);
    }

    currentStyle = rawTextString.Style;
    // Gaps wider than half a space (in the current style) are treated as omitted spaces.
    double gapThreshold = currentStyle.GetWidth(' ') * .5;
    foreach (TextChar textChar in rawTextString.TextChars)
    {
      if (lastChar != null)
      {
        bool spaceInvolved = textChar.Contains(' ') || lastChar.Contains(' ');
        if (!spaceInvolved)
        {
          /*
           * NOTE: PDF contents may omit space characters; they are inferred from the
           * horizontal gap and inserted as virtual characters, so that users can tell
           * original contents from augmented ones.
           */
          float gap = textChar.Box.X - lastChar.Box.Right;
          if (gap > gapThreshold)
          {
            currentLine.TextChars.Add(
              new TextChar(
                ' ',
                new RectangleF(
                  lastChar.Box.Right,
                  textChar.Box.Y,
                  gap,
                  textChar.Box.Height
                  ),
                currentStyle,
                true
                )
              );
          }
        }
        else if (joiningHyphenatedWord && lastChar.Contains(' '))
        {
          // The re-joined word ended at the preceding space: the remaining
          // characters go into a new text string.
          currentLine = new TextString();
          textStrings.Add(currentLine);
          joiningHyphenatedWord = false;
        }
      }
      lastChar = textChar;
      currentLine.TextChars.Add(textChar);
    }
  }
}
/**
 * <summary>Orders the raw text strings by position and merges them into line-level
 * text strings, which are appended to the target list.</summary>
 * <remarks>
 * <para>The source list is sorted in place through
 * <c>TextStringPositionComparer</c>; afterwards each raw text string is appended to
 * the current target text string as long as the comparer reports it on the same line,
 * otherwise a new target text string is started.</para>
 * <para>A virtual space character is synthesized between two consecutive characters
 * whose horizontal gap is at least the space width of the current style. That width is
 * requested from the style's font; when the request fails or yields zero, a quarter of
 * the font size is used instead.</para>
 * </remarks>
 * <param name="rawTextStrings">Source (lower-level) text strings; reordered in place.</param>
 * <param name="textStrings">Target (higher-level) text strings; new entries are appended.</param>
 */
private void Sort(
  List<ContentScanner.TextStringWrapper> rawTextStrings,
  List<ITextString> textStrings
  )
{
  // Step 1: put the raw text strings into positional order.
  rawTextStrings.Sort(new TextStringPositionComparer<ContentScanner.TextStringWrapper>());

  // Step 2: fold the ordered raw text strings into the target text strings.
  TextString currentLine = null;
  TextChar lastChar = null;
  foreach (ContentScanner.TextStringWrapper rawTextString in rawTextStrings)
  {
    // A new target text string is needed at the very beginning and whenever the
    // current (non-empty) one is not on the same line as the incoming raw text string.
    bool startNewLine = (currentLine == null);
    if (!startNewLine && currentLine.TextChars.Count > 0)
    {
      startNewLine = !TextStringPositionComparer<ITextString>.IsOnTheSameLine(
        currentLine.Box.Value,
        rawTextString.Box.Value
        );
    }
    if (startNewLine)
    {
      currentLine = new TextString();
      textStrings.Add(currentLine);
      lastChar = null;
    }

    TextStyle style = rawTextString.Style;

    // Minimum gap regarded as an omitted space: the font's own space width when it
    // can be obtained, otherwise (failure or zero) a rule-of-thumb estimate based on
    // the font size. Any failure while querying the font is deliberately ignored.
    float gapThreshold;
    try
    {
      gapThreshold = style.Font.GetWidth(' ', style.FontSize);
    }
    catch
    {
      gapThreshold = 0;
    }
    if (gapThreshold == 0)
    {
      gapThreshold = style.FontSize * .25f;
    }

    foreach (TextChar textChar in rawTextString.TextChars)
    {
      if (lastChar != null)
      {
        /*
         * NOTE: PDF contents may omit space characters; they are inferred from the
         * horizontal gap and inserted as virtual characters, so that users can tell
         * original contents from augmented ones.
         */
        float gap = textChar.Box.X - lastChar.Box.Right;
        if (gap >= gapThreshold)
        {
          currentLine.TextChars.Add(
            new TextChar(
              ' ',
              new RectangleF(
                lastChar.Box.Right,
                textChar.Box.Y,
                gap,
                textChar.Box.Height
                ),
              style,
              true
              )
            );
        }
      }
      lastChar = textChar;
      currentLine.TextChars.Add(textChar);
    }
  }
}