Exemplo n.º 1
0
 /// <summary>
 /// hOCR v1.2 (HTML)
 /// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
 /// </summary>
 /// <param name="wordExtractor">Word extractor stored by this exporter; must not be null.</param>
 /// <param name="pageSegmenter">Page segmenter stored by this exporter; must not be null.</param>
 /// <param name="scale">Scale value stored by this exporter, defaults to 1.</param>
 /// <param name="indent">Indentation string, defaults to tab. A null value is stored as an empty string.</param>
 /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="wordExtractor"/> or
 /// <paramref name="pageSegmenter"/> is null.</exception>
 public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
 {
     // Reject missing dependencies here rather than failing with a NullReferenceException on first use.
     this.wordExtractor = wordExtractor ?? throw new System.ArgumentNullException(nameof(wordExtractor));
     this.pageSegmenter = pageSegmenter ?? throw new System.ArgumentNullException(nameof(pageSegmenter));
     this.scale         = scale;
     indentChar         = indent ?? string.Empty;
 }
Exemplo n.º 2
0
 /// <summary>
 /// Alto 4.1 (XML).
 /// <para>See https://github.com/altoxml/schema </para>
 /// </summary>
 /// <param name="wordExtractor">Extractor used to identify words in the document. Must not be null.</param>
 /// <param name="pageSegmenter">Segmenter used to split page into blocks. Must not be null.</param>
 /// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
 /// <param name="indent">Character to use for indentation, defaults to tab. Null is stored as an empty string.</param>
 /// <exception cref="ArgumentNullException">Thrown when <paramref name="wordExtractor"/> or
 /// <paramref name="pageSegmenter"/> is null.</exception>
 public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, decimal scale = 1.0m, string indent = "\t")
 {
     if (wordExtractor == null)
     {
         throw new ArgumentNullException(nameof(wordExtractor));
     }

     this.wordExtractor = wordExtractor;

     if (pageSegmenter == null)
     {
         throw new ArgumentNullException(nameof(pageSegmenter));
     }

     this.pageSegmenter = pageSegmenter;

     this.scale = scale;

     // A null indent means "no indentation".
     indentChar = indent == null ? string.Empty : indent;
 }
Exemplo n.º 3
0
 /// <summary>
 /// Creates the exporter from the components and formatting options it stores for later use.
 /// </summary>
 /// <param name="wordExtractor">Word extractor stored by this exporter; must not be null.</param>
 /// <param name="pageSegmenter">Page segmenter stored by this exporter; must not be null.</param>
 /// <param name="scale">Scale value, defaults to 1. It is converted to <see cref="decimal"/> before being stored.</param>
 /// <param name="indent">Indentation string, defaults to tab. A null value is stored as an empty string.</param>
 /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="wordExtractor"/> or
 /// <paramref name="pageSegmenter"/> is null.</exception>
 /// <exception cref="System.OverflowException">Thrown by the double-to-decimal conversion when
 /// <paramref name="scale"/> is NaN, infinite, or outside the decimal range.</exception>
 public hOCR(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
 {
     // Reject missing dependencies here rather than failing with a NullReferenceException on first use.
     if (wordExtractor == null)
     {
         throw new System.ArgumentNullException(nameof(wordExtractor));
     }

     if (pageSegmenter == null)
     {
         throw new System.ArgumentNullException(nameof(pageSegmenter));
     }

     _wordExtractor = wordExtractor;
     _pageSegmenter = pageSegmenter;
     _scale         = (decimal)scale;
     _indent        = indent ?? string.Empty;
 }
Exemplo n.º 4
0
 /// <summary>
 /// PAGE-XML 2019-07-15 (XML) text exporter.
 /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para>
 /// </summary>
 /// <param name="wordExtractor">Word extractor stored by this exporter; must not be null.</param>
 /// <param name="pageSegmenter">Page segmenter stored by this exporter; must not be null.</param>
 /// <param name="readingOrderDetector">Optional reading order detector; defaults to null and is stored as given.</param>
 /// <param name="scale">Scale value stored by this exporter, defaults to 1.</param>
 /// <param name="indent">Indent character, defaults to tab. A null value is stored as an empty string.</param>
 /// <exception cref="System.ArgumentNullException">Thrown when <paramref name="wordExtractor"/> or
 /// <paramref name="pageSegmenter"/> is null.</exception>
 public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t")
 {
     // Reject missing required dependencies here rather than failing with a NullReferenceException on first use.
     this.wordExtractor = wordExtractor ?? throw new System.ArgumentNullException(nameof(wordExtractor));
     this.pageSegmenter = pageSegmenter ?? throw new System.ArgumentNullException(nameof(pageSegmenter));

     // The reading order detector is optional, so null is kept as-is.
     this.readingOrderDetector = readingOrderDetector;
     this.scale      = scale;
     this.indentChar = indent ?? string.Empty;
 }
Exemplo n.º 5
0
 /// <summary>
 /// Replaces the current page segmenter with a new instance of the given type.
 /// A null type leaves the current segmenter unchanged.
 /// </summary>
 /// <param name="pageSegmenter">Type to instantiate with <c>Activator.CreateInstance</c>;
 /// the created object is cast to <see cref="IPageSegmenter"/>.</param>
 public void SetPageSegmenter(Type pageSegmenter)
 {
     if (pageSegmenter != null)
     {
         object created = Activator.CreateInstance(pageSegmenter);
         this.pageSegmenter = (IPageSegmenter)created;
     }
 }
Exemplo n.º 6
0
 /// <summary>
 /// Creates the model for a page, with the word extractor and page segmenter set to the
 /// <c>Instance</c> members of <c>NearestNeighbourWordExtractor</c> and <c>DocstrumBoundingBoxes</c>.
 /// </summary>
 /// <param name="page">The page stored by this model.</param>
 internal PdfPageModel(Page page)
 {
     // Default layout-analysis components.
     wordExtractor = NearestNeighbourWordExtractor.Instance;
     pageSegmenter = DocstrumBoundingBoxes.Instance;

     this.page = page;
 }
        /// <summary>
        /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
        /// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
        /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
        /// left or right edge of the page.</para>
        /// <para>The words and blocks of every page are computed in parallel, then handed in page order to the
        /// overload that works on text blocks.</para>
        /// </summary>
        /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
        /// <param name="wordExtractor">Extractor passed to <c>Page.GetWords</c> to obtain the words of each page.</param>
        /// <param name="pageSegmenter">Segmenter used to group the words of each page into blocks. Must not be null.</param>
        /// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
        /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
        /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <returns>The result of the text-block overload, called with the blocks of each page in page order.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="pages"/> or <paramref name="pageSegmenter"/> is null.</exception>
        /// <exception cref="ArgumentException">Thrown when <paramref name="pages"/> contains fewer than 2 pages.</exception>
        public static IReadOnlyList <IReadOnlyList <TextBlock> > Get(IReadOnlyList <Page> pages,
                                                                     IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func <string, string, double> minimumEditDistanceNormalised,
                                                                     double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
        {
            if (pages == null)
            {
                throw new ArgumentNullException(nameof(pages));
            }

            if (pages.Count < 2)
            {
                throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
            }

            if (pageSegmenter == null)
            {
                // Fail up front instead of inside the parallel loop.
                throw new ArgumentNullException(nameof(pageSegmenter));
            }

            // One slot per page: each iteration writes only its own index, so no concurrent
            // collection is needed and the results are already in page order.
            var pagesBlocks = new IReadOnlyList <TextBlock>[pages.Count];

            ParallelOptions parallelOptions = new ParallelOptions()
            {
                MaxDegreeOfParallelism = maxDegreeOfParallelism
            };

            Parallel.For(0, pages.Count, parallelOptions, p =>
            {
                var words      = pages[p].GetWords(wordExtractor);
                pagesBlocks[p] = pageSegmenter.GetBlocks(words);
            });

            // Materialise as a list for the text-block overload.
            return(Get(pagesBlocks.ToList(),
                       minimumEditDistanceNormalised,
                       similarityThreshold,
                       n,
                       maxDegreeOfParallelism));
        }
 /// <summary>
 /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
 /// <para>Decoration blocks are blocks that contains information such as author names, publication titles, page numbers, etc.
 /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
 /// left or right edge of the page.</para>
 /// <para>Convenience overload: forwards to the overload that takes an edit distance function, passing
 /// <c>Distances.MinimumEditDistanceNormalised</c>.</para>
 /// </summary>
 /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
 /// <param name="wordExtractor">Word extractor, forwarded unchanged.</param>
 /// <param name="pageSegmenter">Page segmenter, forwarded unchanged.</param>
 /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
 /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
 /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
 /// <para>A positive property value limits the number of concurrent operations to the set value.
 /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
 /// <returns>Whatever the forwarded overload returns.</returns>
 public static IReadOnlyList <IReadOnlyList <TextBlock> > Get(IReadOnlyList <Page> pages,
                                                              IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
                                                              double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
     => Get(
         pages,
         wordExtractor,
         pageSegmenter,
         Distances.MinimumEditDistanceNormalised,
         similarityThreshold,
         n,
         maxDegreeOfParallelism);
Exemplo n.º 9
0
        /// <summary>
        /// Classifies every text block of a page: the page letters are grouped into words and blocks,
        /// a feature vector is built for each block, and the prediction engine is run on it.
        /// <para>The method is an iterator, so nothing is computed until the result is enumerated.</para>
        /// </summary>
        /// <param name="page">Page whose letters, paths and images are used to build the block features.</param>
        /// <param name="wordExtractor">Extractor used to group the page letters into words.</param>
        /// <param name="pageSegmenter">Segmenter used to group the words into blocks.</param>
        /// <param name="bookmarks">Optional document bookmarks. When provided, the document bookmark nodes whose
        /// page number equals the page number are passed to the feature extraction; when null, null is passed.</param>
        /// <returns>For each block: the category looked up from the predicted index, the highest score, and the block itself.</returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor,
                                                                                        IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
        {
            // OfType keeps only the non-null DocumentBookmarkNode instances; the list stays null when no bookmarks are given.
            List <DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
                                                         .OfType <DocumentBookmarkNode>()
                                                         .Where(b => b.PageNumber == page.Number)
                                                         .ToList();

            // DefaultIfEmpty keeps Average() from throwing on a page without letters (the average is then the default value, 0).
            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).DefaultIfEmpty().Average();

            var words  = wordExtractor.GetWords(page.Letters);
            var blocks = pageSegmenter.GetBlocks(words);

            foreach (var block in blocks)
            {
                var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(
                    block, paths,
                    images, avgPageFontHeight,
                    block.BoundingBox.Area,
                    bookmarksNodes);

                // The feature vector is positional: each index maps to one model input.
                BlockFeatures blockFeatures = new BlockFeatures()
                {
                    BlockAspectRatio     = features[0],
                    CharsCount           = features[1],
                    WordsCount           = features[2],
                    LinesCount           = features[3],
                    PctNumericChars      = features[4],
                    PctAlphabeticalChars = features[5],
                    PctSymbolicChars     = features[6],
                    PctBulletChars       = features[7],
                    DeltaToHeight        = features[8],
                    PathsCount           = features[9],
                    PctBezierPaths       = features[10],
                    PctHorPaths          = features[11],
                    PctVertPaths         = features[12],
                    PctOblPaths          = features[13],
                    ImagesCount          = features[14],
                    ImageAvgProportion   = features[15],
                    BestNormEditDistance = features[16],
                };

                var result = predEngine.Predict(blockFeatures);

                yield return(FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
            }
        }
Exemplo n.º 10
0
        /// <summary>
        /// Classifies every text block of a page: the page letters are grouped into words and blocks,
        /// a 13-value feature vector is built for each block, and the prediction engine is run on it.
        /// <para>The method is an iterator, so nothing is computed until the result is enumerated.</para>
        /// </summary>
        /// <param name="page">Page whose letters, paths and images are used to build the block features.</param>
        /// <param name="wordExtractor">Extractor used to group the page letters into words.</param>
        /// <param name="pageSegmenter">Segmenter used to group the words into blocks.</param>
        /// <returns>For each block: the category looked up from the predicted index, the highest score, and the block itself.</returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
        {
            var pageWords  = wordExtractor.GetWords(page.Letters);
            var textBlocks = pageSegmenter.GetBlocks(pageWords);

            foreach (var textBlock in textBlocks)
            {
                // Raw inputs for the feature extraction, restricted to the block's bounding box.
                var blockLetters = textBlock.TextLines.SelectMany(line => line.Words).SelectMany(word => word.Letters);
                var blockPaths   = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
                var blockImages  = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());
                var values       = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, blockPaths, blockImages);

                // The feature vector is positional: each index maps to one model input.
                var modelInput = new BlockFeatures
                {
                    CharsCount           = values[0],
                    PctNumericChars      = values[1],
                    PctAlphabeticalChars = values[2],
                    PctSymbolicChars     = values[3],
                    PctBulletChars       = values[4],
                    DeltaToHeight        = values[5],
                    PathsCount           = values[6],
                    PctBezierPaths       = values[7],
                    PctHorPaths          = values[8],
                    PctVertPaths         = values[9],
                    PctOblPaths          = values[10],
                    ImagesCount          = values[11],
                    ImageAvgProportion   = values[12]
                };

                var prediction = predEngine.Predict(modelInput);
                var category   = FeatureHelper.Categories[(int)prediction.Prediction];
                var topScore   = prediction.Score.Max();

                yield return (category, topScore, textBlock);
            }
        }