/// <summary>
        /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
        /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
        /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
        /// left or right edge of the page.</para>
        /// <para>This overload first extracts the words of every page and segments them into blocks (in parallel),
        /// then forwards the per-page blocks, in page order, to the overload that works on pre-computed blocks.</para>
        /// </summary>
        /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
        /// <param name="wordExtractor">The word extractor handed to each page's <c>GetWords</c> call.</param>
        /// <param name="pageSegmenter">The page segmenter used to group the words of each page into blocks.</param>
        /// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
        /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
        /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <exception cref="ArgumentNullException"><paramref name="pages"/> or <paramref name="pageSegmenter"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="pages"/> contains fewer than 2 pages.</exception>
        public static IReadOnlyList <IReadOnlyList <TextBlock> > Get(IReadOnlyList <Page> pages,
                                                                     IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func <string, string, double> minimumEditDistanceNormalised,
                                                                     double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
        {
            if (pages == null)
            {
                throw new ArgumentNullException(nameof(pages));
            }

            if (pageSegmenter == null)
            {
                throw new ArgumentNullException(nameof(pageSegmenter));
            }

            if (pages.Count < 2)
            {
                throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
            }

            // One slot per page: each parallel iteration writes only to its own index, so no
            // synchronisation is required and the page order is kept without sorting afterwards.
            var pagesBlocks = new IReadOnlyList <TextBlock>[pages.Count];

            ParallelOptions parallelOptions = new ParallelOptions()
            {
                MaxDegreeOfParallelism = maxDegreeOfParallelism
            };

            Parallel.For(0, pages.Count, parallelOptions, p =>
            {
                var words      = pages[p].GetWords(wordExtractor);
                pagesBlocks[p] = pageSegmenter.GetBlocks(words);
            });

            return(Get(pagesBlocks.ToList(),
                       minimumEditDistanceNormalised,
                       similarityThreshold,
                       n,
                       maxDegreeOfParallelism));
        }
        /// <summary>
        /// Get the hOCR string for the page: an <c>ocr_page</c> div containing the code of every path
        /// of the page followed by the code of every text block found by the page segmenter.
        /// <para>Increments <c>pageCount</c> on every call.</para>
        /// </summary>
        /// <param name="page">The page to convert.</param>
        /// <param name="imageName">Image name or path; only its file name part is written into the <c>title</c> attribute.</param>
        /// <returns>The hOCR markup of the page, lines separated by '\n'.</returns>
        private string GetCode(Page page, string imageName = "unknown")
        {
            pageCount++;
            imageName = Path.GetFileName(imageName);

            // Accumulate in a StringBuilder: repeated string concatenation inside the loops below
            // would copy the whole page markup once per path / block.
            var hocr = new System.Text.StringBuilder();
            hocr.Append(_indent + @"<div class='ocr_page' id='page_" + page.Number.ToString() +
                        "' title='image \"" + imageName + "\"; bbox 0 0 " +
                        (int)Math.Round(page.Width * _scale) + " " + (int)Math.Round(page.Height * _scale) +
                        "; ppageno " + (page.Number - 1) + "\'>");

            foreach (var path in page.ExperimentalAccess.Paths)
            {
                hocr.Append("\n").Append(GetCode(path, page.Height, true));
            }

            // Materialise the words once so the extractor does not run a second time
            // when the same sequence is handed to the segmenter.
            var words = page.GetWords(_wordExtractor).ToList();

            if (words.Count > 0)
            {
                foreach (var block in _pageSegmenter.GetBlocks(words))
                {
                    hocr.Append("\n").Append(GetCode(block, page.Height));
                }
            }

            hocr.Append("\n").Append(_indent).Append(@"</div>");
            return(hocr.ToString());
        }
Beispiel #3
0
        /// <summary>
        /// Segments the page into text blocks and, for each block, lazily yields the category predicted
        /// by <c>predEngine</c> together with the highest score of the prediction.
        /// </summary>
        /// <param name="page">The page to classify.</param>
        /// <param name="wordExtractor">Extractor used to build words from the letters of the page.</param>
        /// <param name="pageSegmenter">Segmenter used to group the words into blocks.</param>
        /// <returns>One (Prediction, Score, Block) tuple per block, produced on enumeration.</returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
        {
            foreach (var textBlock in pageSegmenter.GetBlocks(wordExtractor.GetWords(page.Letters)))
            {
                // Gather everything located in the block that feeds the feature vector.
                var blockLetters = textBlock.TextLines
                                            .SelectMany(line => line.Words)
                                            .SelectMany(word => word.Letters);
                var blockPaths  = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
                var blockImages = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());

                var vector = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, blockPaths, blockImages);

                // The feature vector is positional: index i maps to the i-th model input below.
                var modelInput = new BlockFeatures
                {
                    CharsCount = vector[0],
                    PctNumericChars = vector[1],
                    PctAlphabeticalChars = vector[2],
                    PctSymbolicChars = vector[3],
                    PctBulletChars = vector[4],
                    DeltaToHeight = vector[5],
                    PathsCount = vector[6],
                    PctBezierPaths = vector[7],
                    PctHorPaths = vector[8],
                    PctVertPaths = vector[9],
                    PctOblPaths = vector[10],
                    ImagesCount = vector[11],
                    ImageAvgProportion = vector[12]
                };

                var outcome  = predEngine.Predict(modelInput);
                var category = FeatureHelper.Categories[(int)outcome.Prediction];
                var topScore = outcome.Score.Max();

                yield return(category, topScore, textBlock);
            }
        }
        /// <summary>
        /// Converts a PDF page into an ALTO page: a single print space covering the whole scaled page,
        /// holding the text blocks, the images and, optionally, the paths of the page.
        /// <para>Sets <c>pageCount</c> to the page number and increments <c>pageSpaceCount</c>.</para>
        /// </summary>
        /// <param name="page">The page to convert.</param>
        /// <param name="includePaths">When true, the paths of the page are exported as graphical elements.</param>
        private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
        {
            pageCount = page.Number;
            pageSpaceCount++;

            // The page and its print space share the same scaled dimensions.
            float scaledHeight = (float)Math.Round(page.Height * scale);
            float scaledWidth  = (float)Math.Round(page.Width * scale);

            string pageId       = "P" + pageCount;
            string printSpaceId = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000");

            var printSpace = new AltoDocument.AltoPageSpace()
            {
                Height = scaledHeight,      // TBD
                Width = scaledWidth,        // TBD
                VerticalPosition = 0f,      // TBD
                HorizontalPosition = 0f,    // TBD
                ComposedBlocks = null,      // TBD
                GraphicalElements = null,   // TBD
                Illustrations = null,       // TBD
                ProcessingRefs = null,      // TBD
                StyleRefs = null,           // TBD
                Id = printSpaceId
            };

            var altoPage = new AltoDocument.AltoPage
            {
                Height = scaledHeight,
                Width = scaledWidth,
                Accuracy = float.NaN,
                Quality = AltoDocument.AltoQuality.OK,
                QualityDetail = null,
                BottomMargin = null,
                LeftMargin = null,
                RightMargin = null,
                TopMargin = null,
                Pc = float.NaN,
                PhysicalImgNr = page.Number,
                PrintedImgNr = null,
                PageClass = null,
                Position = AltoDocument.AltoPosition.Cover,
                Processing = null,
                ProcessingRefs = null,
                StyleRefs = null,
                PrintSpace = printSpace,
                Id = pageId
            };

            // Text blocks: words -> segmented blocks -> ALTO text blocks.
            var pageWords = page.GetWords(wordExtractor);
            altoPage.PrintSpace.TextBlock = pageSegmenter.GetBlocks(pageWords)
                                                         .Select(b => ToAltoTextBlock(b, page.Height))
                                                         .ToArray();

            altoPage.PrintSpace.Illustrations = page.GetImages()
                                                    .Select(i => ToAltoIllustration(i, page.Height))
                                                    .ToArray();

            if (includePaths)
            {
                altoPage.PrintSpace.GraphicalElements = page.ExperimentalAccess.Paths
                                                            .Select(p => ToAltoGraphicalElement(p, page.Height))
                                                            .ToArray();
            }

            return(altoPage);
        }
Beispiel #5
0
        /// <summary>
        /// Converts a PDF page into a PAGE XML page whose regions are, in this order: the text regions
        /// built from the segmented blocks, the image regions and, optionally, the line drawing regions
        /// built from the paths of the page.
        /// </summary>
        /// <param name="page">The page to convert.</param>
        /// <param name="includePaths">When true, the paths of the page are exported as line drawing regions.</param>
        private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
        {
            var pageXmlPage = new PageXmlDocument.PageXmlPage()
            {
                ImageFilename = "unknown",
                ImageHeight   = (int)Math.Round(page.Height * scale),
                ImageWidth    = (int)Math.Round(page.Width * scale),
            };

            var regions = new List <PageXmlDocument.PageXmlRegion>();

            var words = page.GetWords(wordExtractor).ToList();

            if (words.Count > 0)
            {
                var blocks = pageSegmenter.GetBlocks(words);

                // The reading order detector is optional; without it the segmenter order is kept.
                if (readingOrderDetector != null)
                {
                    blocks = readingOrderDetector.Get(blocks).ToList();
                }

                regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));

                // NOTE(review): orderedRegions is not declared in this method; presumably it is filled
                // while the text regions above are created - confirm against ToPageXmlTextRegion.
                if (orderedRegions.Count > 0)
                {
                    pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
                    {
                        Item = new PageXmlDocument.PageXmlOrderedGroup()
                        {
                            Items = orderedRegions.ToArray(),
                            Id    = "g" + groupOrderCount++
                        }
                    };
                }
            }

            var images = page.GetImages().ToList();

            if (images.Count > 0)
            {
                regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
            }

            if (includePaths)
            {
                // Convert every path exactly once. The projection is deferred, so counting it and then
                // filtering it again would invoke ToPageXmlLineDrawingRegion twice per path.
                var graphicalElements = page.ExperimentalAccess.Paths
                                        .Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height))
                                        .Where(g => g != null)
                                        .ToList();

                if (graphicalElements.Count > 0)
                {
                    regions.AddRange(graphicalElements);
                }
            }

            pageXmlPage.Items = regions.ToArray();
            return(pageXmlPage);
        }
Beispiel #6
0
        /// <summary>
        /// Segments the page into text blocks and, for each block, lazily yields the category predicted
        /// by <c>predEngine</c> together with the highest score of the prediction.
        /// </summary>
        /// <param name="page">The page to classify.</param>
        /// <param name="wordExtractor">Extractor used to build words from the letters of the page.</param>
        /// <param name="pageSegmenter">Segmenter used to group the words into blocks.</param>
        /// <param name="bookmarks">Optional document bookmarks; only the document bookmark nodes pointing
        /// to this page are passed to the feature extraction. When null, a null list is passed instead.</param>
        /// <returns>One (Prediction, Score, Block) tuple per block, produced on enumeration.</returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor,
                                                                                        IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
        {
            // OfType both filters and casts, replacing the former Where(is)/Select(as)/Cast chain.
            List <DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
                                                         .OfType <DocumentBookmarkNode>()
                                                         .Where(b => b.PageNumber == page.Number).ToList();

            // Average() throws on an empty sequence; DefaultIfEmpty() makes a page without letters
            // produce an average of 0 instead of an InvalidOperationException.
            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).DefaultIfEmpty().Average();

            var words  = wordExtractor.GetWords(page.Letters);
            var blocks = pageSegmenter.GetBlocks(words);

            foreach (var block in blocks)
            {
                var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(
                    block, paths,
                    images, avgPageFontHeight,
                    block.BoundingBox.Area,
                    bookmarksNodes);

                // The feature vector is positional: index i maps to the i-th model input below.
                BlockFeatures blockFeatures = new BlockFeatures()
                {
                    BlockAspectRatio     = features[0],
                    CharsCount           = features[1],
                    WordsCount           = features[2],
                    LinesCount           = features[3],
                    PctNumericChars      = features[4],
                    PctAlphabeticalChars = features[5],
                    PctSymbolicChars     = features[6],
                    PctBulletChars       = features[7],
                    DeltaToHeight        = features[8],
                    PathsCount           = features[9],
                    PctBezierPaths       = features[10],
                    PctHorPaths          = features[11],
                    PctVertPaths         = features[12],
                    PctOblPaths          = features[13],
                    ImagesCount          = features[14],
                    ImageAvgProportion   = features[15],
                    BestNormEditDistance = features[16],
                };

                var result = predEngine.Predict(blockFeatures);

                yield return(FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
            }
        }
Beispiel #7
0
        /// <summary>
        /// Get the hOCR string for the page.
        /// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
        /// <para>Increments <c>pageCount</c> on every call.</para>
        /// </summary>
        /// <param name="page">The page to convert.</param>
        /// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
        /// <param name="imageName">Image name written into the <c>title</c> attribute of the page element.</param>
        /// <returns>The hOCR markup of the page, lines separated by '\n'.</returns>
        private string GetCode(Page page, bool includePaths, string imageName = "unknown")
        {
            pageCount++;
            int level = 2;

            // Accumulate in a StringBuilder: repeated string concatenation inside the loops below
            // would copy the whole page markup once per path / image / block.
            var hocr = new System.Text.StringBuilder();
            hocr.Append(GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString() +
                        "' title='image \"" + imageName + "\"; bbox 0 0 " +
                        (int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale) +
                        "; ppageno " + (page.Number - 1) + "\'>");

            if (includePaths)
            {
                foreach (var path in page.ExperimentalAccess.Paths)
                {
                    hocr.Append("\n").Append(GetCode(path, page.Height, true, level + 1));
                }
            }

            foreach (var image in page.GetImages())
            {
                hocr.Append("\n").Append(GetCode(image, page.Height, level + 1));
            }

            // Materialise the words once so the extractor does not run a second time
            // when the same sequence is handed to the segmenter.
            var words = page.GetWords(wordExtractor).ToList();

            if (words.Count > 0)
            {
                foreach (var block in pageSegmenter.GetBlocks(words))
                {
                    hocr.Append("\n").Append(GetCodeArea(block, page.Height, level + 1));
                }
            }

            hocr.Append("\n").Append(GetIndent(level)).Append(@"</div>");
            return(hocr.ToString());
        }
Beispiel #8
0
 /// <summary>
 /// Groups the words returned by <c>GetWords()</c> into text blocks using <c>pageSegmenter</c>.
 /// </summary>
 /// <returns>The blocks produced by the page segmenter.</returns>
 public IEnumerable <TextBlock> GetTextBlocks() => pageSegmenter.GetBlocks(GetWords());