/// <summary>
/// Creates a line that holds <paramref name="Text"/> and a single formatting tag
/// starting at position 1, which receives the supplied character formatting.
/// </summary>
/// <param name="document">Forwarded to the two-argument constructor.</param>
/// <param name="LineNo">Value stored as the line number.</param>
/// <param name="Text">Initial content of the line; must not be null.</param>
/// <param name="align">Alignment stored on the line.</param>
/// <param name="font">Font assigned to the initial tag.</param>
/// <param name="color">Colour assigned to the initial tag.</param>
/// <param name="back_color">Background colour assigned to the initial tag.</param>
/// <param name="text_position">Text positioning assigned to the initial tag.</param>
/// <param name="char_offset">Character offset assigned to the initial tag.</param>
/// <param name="left_indent">Value stored as the line indent.</param>
/// <param name="hanging_indent">Value assigned through the HangingIndent property.</param>
/// <param name="right_indent">Value stored as the right indent.</param>
/// <param name="spacing_before">Value stored as the spacing before the line.</param>
/// <param name="spacing_after">Value stored as the spacing after the line.</param>
/// <param name="line_spacing">Value stored as the line spacing.</param>
/// <param name="line_spacing_multiple">Flag stored alongside <paramref name="line_spacing"/>.</param>
/// <param name="tab_stops">Tab stop collection stored on the line.</param>
/// <param name="visible">Visibility assigned to the initial tag.</param>
/// <param name="ending">Line ending; forwarded to the two-argument constructor and stored again here.</param>
internal Line(Document document, int LineNo, string Text, HorizontalAlignment align, Font font, Color color, Color back_color, TextPositioning text_position, float char_offset, float left_indent, float hanging_indent, float right_indent, float spacing_before, float spacing_after, float line_spacing, bool line_spacing_multiple, TabStopCollection tab_stops, bool visible, LineEnding ending)
    : this(document, ending)
{
    // Buffer capacity: one more than the text length when the text is longer
    // than the default, otherwise the default length.
    if (Text.Length > DEFAULT_TEXT_LEN)
    {
        space = Text.Length + 1;
    }
    else
    {
        space = DEFAULT_TEXT_LEN;
    }
    text = new StringBuilder(Text, space);

    // Paragraph-level settings.
    line_no = LineNo;
    this.ending = ending;
    alignment = align;
    indent = left_indent;
    HangingIndent = hanging_indent;
    this.right_indent = right_indent;
    this.spacing_before = spacing_before;
    this.spacing_after = spacing_after;
    this.tab_stops = tab_stops;
    this.line_spacing = line_spacing;
    this.line_spacing_multiple = line_spacing_multiple;

    // One slot more than the text buffer capacity.
    widths = new float[space + 1];

    // A single tag starting at position 1 carries the character formatting.
    LineTag initialTag = new LineTag(this, 1);
    tags = initialTag;
    initialTag.Font = font;
    initialTag.Color = color;
    initialTag.BackColor = back_color;
    initialTag.TextPosition = text_position;
    initialTag.CharOffset = char_offset;
    initialTag.Visible = visible;
}
/// <summary>
/// Copy constructor: initializes a new
/// <see cref="Tesseract4OcrEngineProperties"/> from the settings of an existing instance.
/// </summary>
/// <param name="other">
/// the <see cref="Tesseract4OcrEngineProperties"/> instance whose settings are copied;
/// it is also passed to the base constructor
/// </param>
public Tesseract4OcrEngineProperties(iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties other)
    : base(other)
{
    tessDataDir = other.tessDataDir;
    pageSegMode = other.pageSegMode;
    preprocessingImages = other.preprocessingImages;
    textPositioning = other.textPositioning;
    pathToUserWordsFile = other.pathToUserWordsFile;
}
/// <summary>
/// Parses each hocr file from the provided list, retrieves text, and
/// returns data in the format described below.
/// </summary>
/// <remarks>
/// Entries of <paramref name="inputFiles"/> that are <c>null</c> or do not exist on disk are skipped.
/// Each input stream is closed even if parsing of that file throws.
/// If a page number is already present in the result, the page is stored under
/// the current maximum key plus one instead of overwriting the existing entry.
/// Pages that yield no text elements are not added to the result.
/// </remarks>
/// <param name="inputFiles">list of input files</param>
/// <param name="textPositioning">
/// <see cref="TextPositioning"/> value; <c>BY_LINES</c> selects elements of class
/// <c>ocr_line</c> and <c>ocr_caption</c>, any other value selects <c>ocrx_word</c>
/// </param>
/// <returns>
/// <see cref="System.Collections.Generic.IDictionary{TKey, TValue}"/>
/// where key is an <c>int</c> representing the number of the page and value is
/// <see cref="System.Collections.Generic.IList{T}"/> of
/// <see cref="iText.Pdfocr.TextInfo"/> elements where each
/// <see cref="iText.Pdfocr.TextInfo"/> element contains a word or a line and its 4
/// coordinates (bbox)
/// </returns>
public static IDictionary<int, IList<TextInfo>> ParseHocrFile(IList<FileInfo> inputFiles, TextPositioning textPositioning)
{
    IDictionary<int, IList<TextInfo>> imageData = new LinkedDictionary<int, IList<TextInfo>>();
    IDictionary<String, iText.StyledXmlParser.Jsoup.Nodes.Node> unparsedBBoxes = new LinkedDictionary<String, iText.StyledXmlParser.Jsoup.Nodes.Node>();
    foreach (FileInfo inputFile in inputFiles)
    {
        if (inputFile != null && File.Exists(System.IO.Path.Combine(inputFile.FullName)))
        {
            // The using block guarantees the file handle is released even when
            // parsing, page-number conversion or bbox extraction throws.
            using (FileStream fileInputStream = new FileStream(inputFile.FullName, FileMode.Open, FileAccess.Read))
            {
                Document doc = iText.StyledXmlParser.Jsoup.Jsoup.Parse(fileInputStream, System.Text.Encoding.UTF8.Name(), inputFile.FullName);
                Elements pages = doc.GetElementsByClass("ocr_page");
                IList<String> searchedClasses = TextPositioning.BY_LINES.Equals(textPositioning)
                    ? JavaUtil.ArraysAsList("ocr_line", "ocr_caption")
                    : JavaCollectionsUtil.SingletonList<String>("ocrx_word");
                foreach (iText.StyledXmlParser.Jsoup.Nodes.Element page in pages)
                {
                    // The page number is the part of the element id that follows "page_".
                    String[] pageNum = iText.IO.Util.StringUtil.Split(page.Id(), "page_");
                    int pageNumber = Convert.ToInt32(pageNum[pageNum.Length - 1], System.Globalization.CultureInfo.InvariantCulture);
                    IList<TextInfo> textData = new List<TextInfo>();
                    if (searchedClasses.Count > 0)
                    {
                        // Start with the elements of the first class, then append the
                        // elements of every remaining class.
                        Elements objects = page.GetElementsByClass(searchedClasses[0]);
                        for (int i = 1; i < searchedClasses.Count; i++)
                        {
                            Elements foundElements = page.GetElementsByClass(searchedClasses[i]);
                            for (int j = 0; j < foundElements.Count; j++)
                            {
                                objects.Add(foundElements[j]);
                            }
                        }
                        foreach (iText.StyledXmlParser.Jsoup.Nodes.Element obj in objects)
                        {
                            IList<float> coordinates = GetAlignedBBox(obj, textPositioning, unparsedBBoxes);
                            textData.Add(new TextInfo(obj.Text(), coordinates));
                        }
                    }
                    if (textData.Count > 0)
                    {
                        // Avoid overwriting a page already collected (e.g. from a previous file).
                        if (imageData.ContainsKey(pageNumber))
                        {
                            pageNumber = Enumerable.Max(imageData.Keys) + 1;
                        }
                        imageData.Put(pageNumber, textData);
                    }
                }
            }
        }
    }
    // Report every node whose bbox could not be parsed.
    foreach (iText.StyledXmlParser.Jsoup.Nodes.Node node in unparsedBBoxes.Values)
    {
        LOGGER.Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, node.ToString()));
    }
    return imageData;
}
/// <summary>
/// Parses the bbox of the element and, for word-based positioning, reconciles it
/// with the bbox of the element's parent.
/// </summary>
/// <remarks>
/// For <c>BY_WORDS_AND_LINES</c> the bottom and top values are taken from the parent's bbox.
/// For both <c>BY_WORDS_AND_LINES</c> and <c>BY_WORDS</c> the element bbox and the parent bbox
/// are then handed to <c>DetectAndFixBrokenBBoxes</c>. For any other positioning the parsed
/// bbox is returned as is.
/// </remarks>
/// <param name="object">element whose bbox is requested</param>
/// <param name="textPositioning">positioning mode that decides whether the parent bbox is consulted</param>
/// <param name="unparsedBBoxes">collection passed through to the bbox parsing helpers</param>
/// <returns>the (possibly adjusted) bbox coordinates of the element</returns>
internal static IList<float> GetAlignedBBox(iText.StyledXmlParser.Jsoup.Nodes.Element @object, TextPositioning textPositioning, IDictionary<String, iText.StyledXmlParser.Jsoup.Nodes.Node> unparsedBBoxes)
{
    IList<float> bbox = ParseBBox(@object, unparsedBBoxes);

    bool alignToLine = TextPositioning.BY_WORDS_AND_LINES == textPositioning;
    if (!alignToLine && TextPositioning.BY_WORDS != textPositioning)
    {
        // Not a word-based mode: nothing to align.
        return bbox;
    }

    iText.StyledXmlParser.Jsoup.Nodes.Node parentLine = @object.Parent();
    IList<float> lineBBox = ParseBBox(parentLine, unparsedBBoxes);
    if (alignToLine)
    {
        bbox[BOTTOM_IDX] = lineBBox[BOTTOM_IDX];
        bbox[TOP_IDX] = lineBBox[TOP_IDX];
    }
    DetectAndFixBrokenBBoxes(@object, bbox, lineBBox, unparsedBBoxes);
    return bbox;
}
/// <summary>
/// Sets the <see cref="TextPositioning"/> mode that defines the way text is
/// retrieved from tesseract output.
/// </summary>
/// <param name="positioning">the way text is retrieved</param>
/// <returns>
/// this <see cref="Tesseract4OcrEngineProperties"/> instance, so calls can be chained
/// </returns>
public iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetTextPositioning(TextPositioning positioning)
{
    this.textPositioning = positioning;
    return this;
}
/// <summary>
/// Copies onto <paramref name="tag"/> only those formatting values whose flag
/// is set in <paramref name="specified"/>; all other values of the tag are left untouched.
/// </summary>
/// <param name="tag">Tag that receives the formatting.</param>
/// <param name="font">Applied when the Font flag is set.</param>
/// <param name="color">Applied when the Color flag is set.</param>
/// <param name="back_color">Applied when the BackColor flag is set.</param>
/// <param name="text_position">Applied when the TextPosition flag is set.</param>
/// <param name="char_offset">Applied when the CharOffset flag is set.</param>
/// <param name="visible">Applied when the Visibility flag is set.</param>
/// <param name="specified">Bit mask selecting which of the values above to apply.</param>
private static void SetFormat(LineTag tag, Font font, Color color, Color back_color, TextPositioning text_position, float char_offset, bool visible, FormatSpecified specified)
{
    bool applyFont = (specified & FormatSpecified.Font) == FormatSpecified.Font;
    bool applyColor = (specified & FormatSpecified.Color) == FormatSpecified.Color;
    bool applyBackColor = (specified & FormatSpecified.BackColor) == FormatSpecified.BackColor;
    bool applyTextPosition = (specified & FormatSpecified.TextPosition) == FormatSpecified.TextPosition;
    bool applyCharOffset = (specified & FormatSpecified.CharOffset) == FormatSpecified.CharOffset;
    bool applyVisibility = (specified & FormatSpecified.Visibility) == FormatSpecified.Visibility;

    if (applyFont)
    {
        tag.Font = font;
    }

    // Colours are written to the tag's fields directly rather than through properties.
    if (applyColor)
    {
        tag.color = color;
    }

    if (applyBackColor)
    {
        tag.back_color = back_color;
    }

    if (applyTextPosition)
    {
        tag.TextPosition = text_position;
    }

    if (applyCharOffset)
    {
        tag.CharOffset = char_offset;
    }

    if (applyVisibility)
    {
        tag.Visible = visible;
    }
}
/// <summary>
/// Applies the formatting values selected by <paramref name="specified"/> to
/// <paramref name="length"/> characters of <paramref name="line"/> starting at
/// <paramref name="formatStart"/>, splitting existing tags where the range does not
/// coincide with tag boundaries.
/// </summary>
/// <param name="line">Line whose tags are formatted; it is always flagged for recalculation.</param>
/// <param name="formatStart">1-based character position on the line.</param>
/// <param name="length">Number of characters to format; capped at the length of the line text. Zero denotes an empty selection.</param>
/// <param name="font">Font to apply when the Font flag is specified.</param>
/// <param name="color">Colour to apply when the Color flag is specified.</param>
/// <param name="backColor">Background colour to apply when the BackColor flag is specified.</param>
/// <param name="text_position">Text positioning to apply when the TextPosition flag is specified.</param>
/// <param name="char_offset">Character offset to apply when the CharOffset flag is specified.</param>
/// <param name="visible">Visibility to apply when the Visibility flag is specified.</param>
/// <param name="specified">Bit mask selecting which formatting values are applied.</param>
/// <returns>
/// true when a font is being applied whose height differs from the current text height
/// of the line (i.e. the line height is expected to change); otherwise false.
/// </returns>
public static bool FormatText(Line line, int formatStart, int length, Font font, Color color, Color backColor, TextPositioning text_position, float char_offset, bool visible, FormatSpecified specified)
{
    // Assume the line height is unchanged unless a font of a different height
    // is being applied. (Possibly too simple.)
    bool heightChanged = ((FormatSpecified.Font & specified) == FormatSpecified.Font)
        && font.Height != line.TextHeight;

    // Forces recalculation of the line in RecalculateDocument.
    line.recalc = true;

    // Sanity clamp; may not be strictly required.
    if (length > line.text.Length)
    {
        length = line.text.Length;
    }

    LineTag current = line.tags;
    int end = formatStart + length;

    if (formatStart == 1)
    {
        // Common special case: the range covers exactly the first tag.
        if (length == current.Length)
        {
            SetFormat(current, font, color, backColor, text_position, char_offset, visible, specified);
            return heightChanged;
        }

        // Empty selection at the beginning of the line: only one new tag is needed.
        if (length == 0)
        {
            line.tags.Break(1);
            SetFormat(line.tags, font, color, backColor, text_position, char_offset, visible, specified);
            return heightChanged;
        }
    }

    LineTag first = FindTag(line, formatStart - 1);

    // We may already be next to an empty tag, e.g.
    // [Tag 0 - "He"][Tag 1 = 0 length][Tag 2 "llo world"]:
    // FindTag returns tag 0 at position 3, but the empty tag after it is the one to use.
    if (first.End == formatStart && length == 0 && first.Next != null && first.Next.Length == 0)
    {
        SetFormat(first.Next, font, color, backColor, text_position, char_offset, visible, specified);
        return heightChanged;
    }

    // When positioned at the end of a tag, move on to the following tag.
    while (first.End == formatStart && first.Next != null)
    {
        first = first.Next;
    }

    // The range coincides exactly with an existing tag.
    if (first.Start == formatStart && first.Length == length)
    {
        SetFormat(first, font, color, backColor, text_position, char_offset, visible, specified);
        return heightChanged;
    }

    // Split the tag only if the range does not begin at the tag's own start.
    current = first.Start != formatStart ? first.Break(formatStart) : first;

    // Empty selection at the end of the line: the only situation where the rest of
    // the tag is empty, since we already moved to the beginning of the next
    // non-empty tag.
    if (current.Length == 0)
    {
        SetFormat(current, font, color, backColor, text_position, char_offset, visible, specified);
        return heightChanged;
    }

    // Empty selection: create another tag after the new (now empty) one.
    if (length == 0)
    {
        current.Break(formatStart);
        SetFormat(current, font, color, backColor, text_position, char_offset, visible, specified);
        return heightChanged;
    }

    // Format every tag that lies completely inside the range.
    bool reachedEnd = false;
    for (; current != null && current.End <= end; current = current.next)
    {
        SetFormat(current, font, color, backColor, text_position, char_offset, visible, specified);
        reachedEnd |= current.End == end;
    }

    // Did the last tag conveniently end exactly at the end of the range?
    if (reachedEnd || (current != null && current.End == end))
    {
        return heightChanged;
    }

    // Otherwise split the tag containing the end of the range and format its first part.
    LineTag last = FindTag(line, end - 1);
    if (last != null)
    {
        last.Break(end);
        SetFormat(last, font, color, backColor, text_position, char_offset, visible, specified);
    }

    return heightChanged;
}