コード例 #1
0
ファイル: Line.cs プロジェクト: SickheadGames/BRUTE.mono
        /// <summary>
        /// Builds a line that holds <paramref name="Text"/>, applies the supplied paragraph-level
        /// settings (alignment, indents, spacing, tab stops) and creates one initial tag starting at
        /// position 1 that carries the character-level settings (font, colors, text position,
        /// character offset, visibility).
        /// </summary>
        internal Line(Document document, int LineNo, string Text, HorizontalAlignment align, Font font, Color color,
                      Color back_color, TextPositioning text_position, float char_offset, float left_indent, float hanging_indent,
                      float right_indent, float spacing_before, float spacing_after, float line_spacing, bool line_spacing_multiple,
                      TabStopCollection tab_stops, bool visible, LineEnding ending)
            : this(document, ending)
        {
            // Capacity of the text buffer: one more than the text when it exceeds the default,
            // otherwise the default length.
            if (Text.Length > DEFAULT_TEXT_LEN)
            {
                space = Text.Length + 1;
            }
            else
            {
                space = DEFAULT_TEXT_LEN;
            }

            // Text storage and the per-character width table (one slot more than the capacity).
            text   = new StringBuilder(Text, space);
            widths = new float[space + 1];

            // Identity of the line.
            line_no     = LineNo;
            this.ending = ending;

            // Paragraph-level formatting.
            alignment                  = align;
            indent                     = left_indent;
            HangingIndent              = hanging_indent;
            this.right_indent          = right_indent;
            this.spacing_before        = spacing_before;
            this.spacing_after         = spacing_after;
            this.line_spacing          = line_spacing;
            this.line_spacing_multiple = line_spacing_multiple;
            this.tab_stops             = tab_stops;

            // Character-level formatting lives on the first (and so far only) tag of the line.
            tags = new LineTag(this, 1);
            tags.Font         = font;
            tags.Color        = color;
            tags.BackColor    = back_color;
            tags.TextPosition = text_position;
            tags.CharOffset   = char_offset;
            tags.Visible      = visible;
        }
コード例 #2
0
 /// <summary>
 /// Copy constructor: creates a new <see cref="Tesseract4OcrEngineProperties"/> instance
 /// whose settings are taken from another <see cref="Tesseract4OcrEngineProperties"/> instance.
 /// </summary>
 /// <param name="other">
 /// the <see cref="Tesseract4OcrEngineProperties"/> instance to copy from
 /// </param>
 public Tesseract4OcrEngineProperties(iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties other)
     : base(other)
 {
     // The base constructor receives the same instance; the Tesseract-specific
     // members are copied here one by one.
     tessDataDir = other.tessDataDir;
     pageSegMode = other.pageSegMode;
     preprocessingImages = other.preprocessingImages;
     textPositioning = other.textPositioning;
     pathToUserWordsFile = other.pathToUserWordsFile;
 }
コード例 #3
0
        /// <summary>
        /// Parses each hOCR file from the provided list, retrieves the recognized text, and
        /// returns it grouped by page number.
        /// </summary>
        /// <remarks>
        /// Entries of <paramref name="inputFiles"/> that are <c>null</c> or do not exist on disk are skipped.
        /// Each file is opened read-only and is closed again even when parsing throws.
        /// When a page number is already present in the result, the page is stored under
        /// the current maximum key plus one instead of overwriting the earlier page.
        /// Pages without any matching text element are not added to the result.
        /// Nodes whose bbox could not be parsed are reported through a warning log entry.
        /// </remarks>
        /// <param name="inputFiles">list of input files</param>
        /// <param name="textPositioning">
        /// the <see cref="TextPositioning"/> mode; <c>BY_LINES</c> selects elements of class
        /// <c>ocr_line</c> and <c>ocr_caption</c>, any other mode selects elements of class <c>ocrx_word</c>
        /// </param>
        /// <returns>
        /// a dictionary where the key is the number of the page and the value is the list of
        /// <see cref="iText.Pdfocr.TextInfo"/> elements of that page; each element contains
        /// the text of a word or a line and its 4 coordinates (bbox)
        /// </returns>
        public static IDictionary <int, IList <TextInfo> > ParseHocrFile(IList <FileInfo> inputFiles, TextPositioning
                                                                         textPositioning)
        {
            IDictionary <int, IList <TextInfo> > imageData = new LinkedDictionary <int, IList <TextInfo> >();
            IDictionary <String, iText.StyledXmlParser.Jsoup.Nodes.Node> unparsedBBoxes = new LinkedDictionary <String,
                                                                                                                iText.StyledXmlParser.Jsoup.Nodes.Node>();

            foreach (FileInfo inputFile in inputFiles)
            {
                // Skip missing entries instead of failing the whole batch.
                if (inputFile == null || !File.Exists(inputFile.FullName))
                {
                    continue;
                }

                // 'using' guarantees the file handle is released even if parsing below throws.
                using (FileStream fileInputStream = new FileStream(inputFile.FullName, FileMode.Open, FileAccess.Read))
                {
                    Document doc = iText.StyledXmlParser.Jsoup.Jsoup.Parse(fileInputStream, System.Text.Encoding.UTF8.Name(),
                                                                           inputFile.FullName);
                    Elements       pages           = doc.GetElementsByClass("ocr_page");
                    IList <String> searchedClasses = TextPositioning.BY_LINES.Equals(textPositioning) ? JavaUtil.ArraysAsList("ocr_line"
                                                                                                                              , "ocr_caption") : JavaCollectionsUtil.SingletonList <String>("ocrx_word");
                    foreach (iText.StyledXmlParser.Jsoup.Nodes.Element page in pages)
                    {
                        // The page number is the part of the element id that follows the last "page_".
                        String[] pageNum    = iText.IO.Util.StringUtil.Split(page.Id(), "page_");
                        int      pageNumber = Convert.ToInt32(pageNum[pageNum.Length - 1], System.Globalization.CultureInfo.InvariantCulture
                                                              );
                        IList <TextInfo> textData = new List <TextInfo>();
                        if (searchedClasses.Count > 0)
                        {
                            // Collect the elements of every searched class into one list.
                            Elements objects = page.GetElementsByClass(searchedClasses[0]);
                            for (int i = 1; i < searchedClasses.Count; i++)
                            {
                                Elements foundElements = page.GetElementsByClass(searchedClasses[i]);
                                for (int j = 0; j < foundElements.Count; j++)
                                {
                                    objects.Add(foundElements[j]);
                                }
                            }
                            foreach (iText.StyledXmlParser.Jsoup.Nodes.Element obj in objects)
                            {
                                IList <float> coordinates = GetAlignedBBox(obj, textPositioning, unparsedBBoxes);
                                textData.Add(new TextInfo(obj.Text(), coordinates));
                            }
                        }
                        if (textData.Count > 0)
                        {
                            // Avoid overwriting a page with the same number coming from an earlier file.
                            if (imageData.ContainsKey(pageNumber))
                            {
                                pageNumber = Enumerable.Max(imageData.Keys) + 1;
                            }
                            imageData.Put(pageNumber, textData);
                        }
                    }
                }
            }
            foreach (iText.StyledXmlParser.Jsoup.Nodes.Node node in unparsedBBoxes.Values)
            {
                LOGGER.Warn(MessageFormatUtil.Format(Tesseract4LogMessageConstant.CANNOT_PARSE_NODE_BBOX, node.ToString())
                            );
            }
            return(imageData);
        }
コード例 #4
0
        /// <summary>
        /// Parses the bbox of the element and, for the word-based positioning modes, aligns it
        /// against the bbox of the element's parent node.
        /// </summary>
        /// <remarks>
        /// For <c>BY_WORDS_AND_LINES</c> the bottom and top coordinates are replaced by those of the parent.
        /// For both <c>BY_WORDS_AND_LINES</c> and <c>BY_WORDS</c> the result is then passed through
        /// <c>DetectAndFixBrokenBBoxes</c>. Any other mode returns the parsed bbox unchanged.
        /// </remarks>
        internal static IList <float> GetAlignedBBox(iText.StyledXmlParser.Jsoup.Nodes.Element @object, TextPositioning
                                                     textPositioning, IDictionary <String, iText.StyledXmlParser.Jsoup.Nodes.Node> unparsedBBoxes)
        {
            IList <float> bbox = ParseBBox(@object, unparsedBBoxes);

            bool alignToLine = textPositioning == TextPositioning.BY_WORDS_AND_LINES;
            if (!alignToLine && textPositioning != TextPositioning.BY_WORDS)
            {
                // Nothing to align for the remaining modes.
                return bbox;
            }

            iText.StyledXmlParser.Jsoup.Nodes.Node parentLine = @object.Parent();
            IList <float> lineBBox = ParseBBox(parentLine, unparsedBBoxes);
            if (alignToLine)
            {
                // Take the vertical extent from the parent so the words share its top and bottom.
                bbox[BOTTOM_IDX] = lineBBox[BOTTOM_IDX];
                bbox[TOP_IDX] = lineBBox[TOP_IDX];
            }
            DetectAndFixBrokenBBoxes(@object, bbox, lineBBox, unparsedBBoxes);
            return bbox;
        }
コード例 #5
0
 /// <summary>
 /// Sets the <see cref="TextPositioning"/> mode, i.e. the way text is retrieved
 /// from tesseract output.
 /// </summary>
 /// <param name="positioning">the way text is retrieved</param>
 /// <returns>
 /// this <see cref="Tesseract4OcrEngineProperties"/> instance, so calls can be chained
 /// </returns>
 public iText.Pdfocr.Tesseract4.Tesseract4OcrEngineProperties SetTextPositioning(TextPositioning positioning)
 {
     this.textPositioning = positioning;
     return this;
 }
コード例 #6
0
ファイル: LineTag.cs プロジェクト: SickheadGames/BRUTE.mono
 /// <summary>
 /// Copies onto <paramref name="tag"/> only those formatting values whose flag is set in
 /// <paramref name="specified"/>; values whose flag is not set leave the tag untouched.
 /// </summary>
 private static void SetFormat(LineTag tag, Font font, Color color, Color back_color, TextPositioning text_position,
                               float char_offset, bool visible, FormatSpecified specified)
 {
     // Decode the flag set once up front.
     bool applyFont         = (specified & FormatSpecified.Font) == FormatSpecified.Font;
     bool applyColor        = (specified & FormatSpecified.Color) == FormatSpecified.Color;
     bool applyBackColor    = (specified & FormatSpecified.BackColor) == FormatSpecified.BackColor;
     bool applyTextPosition = (specified & FormatSpecified.TextPosition) == FormatSpecified.TextPosition;
     bool applyCharOffset   = (specified & FormatSpecified.CharOffset) == FormatSpecified.CharOffset;
     bool applyVisibility   = (specified & FormatSpecified.Visibility) == FormatSpecified.Visibility;

     if (applyFont)
     {
         tag.Font = font;
     }

     // The two colors are written to the fields directly, not through properties.
     if (applyColor)
     {
         tag.color = color;
     }

     if (applyBackColor)
     {
         tag.back_color = back_color;
     }

     if (applyTextPosition)
     {
         tag.TextPosition = text_position;
     }

     if (applyCharOffset)
     {
         tag.CharOffset = char_offset;
     }

     if (applyVisibility)
     {
         tag.Visible = visible;
     }
 }
コード例 #7
0
ファイル: LineTag.cs プロジェクト: SickheadGames/BRUTE.mono
        /// <summary>Applies the formatting values selected by <paramref name="specified"/> to the characters
        /// of <paramref name="line"/> starting at <paramref name="formatStart"/> for <paramref name="length"/> chars,
        /// breaking existing tags at the range boundaries where needed.
        /// Always marks the line for recalculation.</summary>
        /// <param name="line">Line whose tag list is modified</param>
        /// <param name="formatStart">1-based character position on line</param>
        /// <param name="length">Number of characters to format; clamped to the length of the line's text.
        /// A length of 0 sets the format on an empty tag at that position.</param>
        /// <param name="font">Font to apply when the Font flag is specified</param>
        /// <param name="color">Color to apply when the Color flag is specified</param>
        /// <param name="backColor">Background color to apply when the BackColor flag is specified</param>
        /// <param name="text_position">Text position to apply when the TextPosition flag is specified</param>
        /// <param name="char_offset">Character offset to apply when the CharOffset flag is specified</param>
        /// <param name="visible">Visibility to apply when the Visibility flag is specified</param>
        /// <param name="specified">Flags selecting which of the values above are applied (see SetFormat)</param>
        /// <returns>true if a font was specified and its height differs from the line's current text height,
        /// i.e. the line height is assumed to have changed; false otherwise</returns>
        public static bool FormatText(Line line, int formatStart, int length, Font font, Color color, Color backColor,
                                      TextPositioning text_position, float char_offset, bool visible, FormatSpecified specified)
        {
            LineTag tag;
            LineTag start_tag;
            LineTag end_tag;
            int     end;
            bool    retval = false;                     // Assume line-height doesn't change

            // Heuristic ("too simple?"): any specified font of a different height counts as a line-height change
            if (((FormatSpecified.Font & specified) == FormatSpecified.Font) && font.Height != line.TextHeight)
            {
                retval = true;
            }

            line.recalc = true;                         // This forces recalculation of the line in RecalculateDocument

            // A little sanity, not sure if it's needed, might be able to remove for speed
            if (length > line.text.Length)
            {
                length = line.text.Length;
            }

            tag = line.tags;
            end = formatStart + length;                 // Position just past the last formatted character

            // Common special case: the range is exactly the first tag, so no tag needs to be broken
            if ((formatStart == 1) && (length == tag.Length))
            {
                SetFormat(tag, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // empty selection style at beginning of line means
            // we only need one new tag
            if (formatStart == 1 && length == 0)
            {
                line.tags.Break(1);
                SetFormat(line.tags, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // NOTE(review): FindTag is given formatStart - 1, presumably a 0-based position - confirm against FindTag
            start_tag = FindTag(line, formatStart - 1);

            // we are at an empty tag already!
            // e.g. [Tag 0 - "He"][Tag 1 = 0 length][Tag 2 "llo world"]
            // Find Tag will return tag 0 at position 3, but we should just
            // use the empty tag after..
            if (start_tag.End == formatStart && length == 0 && start_tag.Next != null && start_tag.Next.Length == 0)
            {
                SetFormat(start_tag.Next, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // if we are at the end of a tag, we want to move to the next tag
            while (start_tag.End == formatStart && start_tag.Next != null)
            {
                start_tag = start_tag.Next;
            }

            // The range coincides exactly with one existing tag: format it in place
            if (start_tag.Start == formatStart && start_tag.Length == length)
            {
                SetFormat(start_tag, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // Break the tag if needed -- we don't need to break for the start if we're starting at its start.
            // NOTE(review): Break presumably splits the tag at the given position and returns the new
            // trailing tag - confirm against LineTag.Break
            if (start_tag.Start != formatStart)
            {
                tag = start_tag.Break(formatStart);
            }
            else
            {
                tag = start_tag;
            }

            // empty selection style at end of line - it's the only situation
            // where the rest of the tag would be empty, since we moved to the
            // beginning of next non empty tag
            if (tag.Length == 0)
            {
                SetFormat(tag, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // empty - so we just create another tag for
            // after our new (now) empty one..
            if (length == 0)
            {
                tag.Break(formatStart);
                SetFormat(tag, font, color, backColor, text_position, char_offset, visible, specified);
                return(retval);
            }

            // Set while walking when some tag ends exactly at the end of the range
            bool atEnd = false;

            // Format every tag that lies completely inside the range
            while (tag != null && tag.End <= end)
            {
                SetFormat(tag, font, color, backColor, text_position, char_offset, visible, specified);
                atEnd |= tag.End == end;
                tag    = tag.next;
            }

            // did the last tag conveniently fit?
            // NOTE(review): after the loop above tag is either null or has End > end, so the
            // second clause looks like it can never be true - confirm before relying on it
            if (atEnd || (tag != null && tag.End == end))
            {
                return(retval);
            }

            // Now do the last tag: it straddles the end of the range, so break it at 'end'
            // and format the tag returned by FindTag
            end_tag = FindTag(line, end - 1);

            if (end_tag != null)
            {
                end_tag.Break(end);
                SetFormat(end_tag, font, color, backColor, text_position, char_offset, visible, specified);
            }

            return(retval);
        }