/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
/// <param name="wordExtractor">Passed to each page's <c>GetWords</c> call to extract the page's words.</param>
/// <param name="pageSegmenter">Groups the words of each page into <see cref="TextBlock"/>s.</param>
/// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive property value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
/// <returns>The result of the block-based <c>Get</c> overload, called with the blocks of every page in page order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="pages"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="pages"/> contains fewer than 2 pages.</exception>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
    if (pages == null)
    {
        throw new ArgumentNullException(nameof(pages));
    }

    if (pages.Count < 2)
    {
        throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
    }

    // One slot per page. Every parallel iteration writes only its own index, so no
    // concurrent collection is needed and the results are already in page order.
    var pagesBlocks = new IReadOnlyList<TextBlock>[pages.Count];

    var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    Parallel.For(0, pages.Count, parallelOptions, p =>
    {
        var words = pages[p].GetWords(wordExtractor);
        pagesBlocks[p] = pageSegmenter.GetBlocks(words);
    });

    // Hand over a List, as before, so the same overload is selected.
    return Get(pagesBlocks.ToList(), minimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism);
}
/// <summary>
/// Builds the hOCR <c>ocr_page</c> element for a page: the opening <c>div</c>, one entry per path
/// in <c>page.ExperimentalAccess.Paths</c>, one entry per text block, and the closing tag.
/// Increments <c>pageCount</c> on every call.
/// </summary>
/// <param name="page">The page to convert.</param>
/// <param name="imageName">Image name written in the <c>title</c> attribute; only its file-name part is used.</param>
/// <returns>The hOCR markup for the page.</returns>
private string GetCode(Page page, string imageName = "unknown")
{
    pageCount++;
    imageName = Path.GetFileName(imageName);

    string hocr = _indent + @"<div class='ocr_page' id='page_" + page.Number.ToString()
        + "' title='image \"" + imageName + "\"; bbox 0 0 "
        + (int)Math.Round(page.Width * _scale) + " " + (int)Math.Round(page.Height * _scale)
        + "; ppageno " + (page.Number - 1) + "\'>";

    foreach (var path in page.ExperimentalAccess.Paths)
    {
        hocr += "\n" + GetCode(path, page.Height, true);
    }

    // Materialise the words once: the sequence may be lazily evaluated, and it is
    // needed both for the emptiness check and for the segmentation itself.
    var words = page.GetWords(_wordExtractor).ToList();
    if (words.Count > 0)
    {
        var blocks = _pageSegmenter.GetBlocks(words);
        foreach (var block in blocks)
        {
            hocr += "\n" + GetCode(block, page.Height);
        }
    }

    hocr += "\n" + _indent + @"</div>";
    return hocr;
}
/// <summary>
/// Lazily classifies every text block of a page.
/// <para>The page letters are turned into words by <paramref name="wordExtractor"/>, the words into blocks by
/// <paramref name="pageSegmenter"/>, and each block's feature vector is fed to the prediction engine.</para>
/// </summary>
/// <param name="page">The page whose blocks are classified.</param>
/// <param name="wordExtractor">Builds words from the page letters.</param>
/// <param name="pageSegmenter">Groups the words into text blocks.</param>
/// <returns>For each block: the category found in <c>FeatureHelper.Categories</c> at the predicted index,
/// the highest value in the prediction's score collection, and the block itself.</returns>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
{
    var pageWords = wordExtractor.GetWords(page.Letters);

    foreach (var textBlock in pageSegmenter.GetBlocks(pageWords))
    {
        // Gather the block content: its letters, plus the paths and images located inside its bounding box.
        var blockLetters = textBlock.TextLines
            .SelectMany(line => line.Words)
            .SelectMany(word => word.Letters);
        var blockPaths = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
        var blockImages = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());

        var vector = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, blockPaths, blockImages);

        // The feature vector is positional: index i feeds the i-th model input below.
        var input = new BlockFeatures
        {
            CharsCount = vector[0],
            PctNumericChars = vector[1],
            PctAlphabeticalChars = vector[2],
            PctSymbolicChars = vector[3],
            PctBulletChars = vector[4],
            DeltaToHeight = vector[5],
            PathsCount = vector[6],
            PctBezierPaths = vector[7],
            PctHorPaths = vector[8],
            PctVertPaths = vector[9],
            PctOblPaths = vector[10],
            ImagesCount = vector[11],
            ImageAvgProportion = vector[12]
        };

        var prediction = predEngine.Predict(input);

        yield return (FeatureHelper.Categories[(int)prediction.Prediction], prediction.Score.Max(), textBlock);
    }
}
/// <summary>
/// Converts a page into its ALTO page element.
/// <para>Sets <c>pageCount</c> to the page number and increments <c>pageSpaceCount</c>; both are used to build the ids.</para>
/// </summary>
/// <param name="page">The page to convert.</param>
/// <param name="includePaths">When <c>true</c>, the page paths are exported as graphical elements.</param>
/// <returns>The ALTO page, with its print space holding the text blocks, the illustrations and, optionally, the graphical elements.</returns>
private AltoDocument.AltoPage ToAltoPage(Page page, bool includePaths)
{
    pageCount = page.Number;
    pageSpaceCount++;

    // The page and its print space share the same scaled, rounded dimensions.
    float scaledHeight = (float)Math.Round(page.Height * scale);
    float scaledWidth = (float)Math.Round(page.Width * scale);

    var result = new AltoDocument.AltoPage
    {
        Height = scaledHeight,
        Width = scaledWidth,
        Accuracy = float.NaN,
        Quality = AltoDocument.AltoQuality.OK,
        QualityDetail = null,
        BottomMargin = null,
        LeftMargin = null,
        RightMargin = null,
        TopMargin = null,
        Pc = float.NaN,
        PhysicalImgNr = page.Number,
        PrintedImgNr = null,
        PageClass = null,
        Position = AltoDocument.AltoPosition.Cover,
        Processing = null,
        ProcessingRefs = null,
        StyleRefs = null,
        PrintSpace = new AltoDocument.AltoPageSpace()
        {
            Height = scaledHeight,      // TBD
            Width = scaledWidth,        // TBD
            VerticalPosition = 0f,      // TBD
            HorizontalPosition = 0f,    // TBD
            ComposedBlocks = null,      // TBD
            GraphicalElements = null,   // TBD
            Illustrations = null,       // TBD
            ProcessingRefs = null,      // TBD
            StyleRefs = null,           // TBD
            Id = "P" + pageCount + "_PS" + pageSpaceCount.ToString("#00000")
        },
        Id = "P" + pageCount
    };

    var printSpace = result.PrintSpace;

    // Text: words -> blocks -> ALTO text blocks.
    var pageWords = page.GetWords(wordExtractor);
    printSpace.TextBlock = pageSegmenter.GetBlocks(pageWords)
        .Select(block => ToAltoTextBlock(block, page.Height))
        .ToArray();

    // Images are always exported.
    printSpace.Illustrations = page.GetImages()
        .Select(image => ToAltoIllustration(image, page.Height))
        .ToArray();

    // Paths are exported only on request.
    if (includePaths)
    {
        printSpace.GraphicalElements = page.ExperimentalAccess.Paths
            .Select(path => ToAltoGraphicalElement(path, page.Height))
            .ToArray();
    }

    return result;
}
/// <summary>
/// Converts a page into its PAGE XML page element, made of text regions, image regions and,
/// optionally, line-drawing regions built from the page paths.
/// </summary>
/// <param name="page">The page to convert.</param>
/// <param name="includePaths">When <c>true</c>, the page paths are exported as line-drawing regions.</param>
/// <returns>The PAGE XML page with its regions and, when ordered regions exist, its reading order.</returns>
private PageXmlDocument.PageXmlPage ToPageXmlPage(Page page, bool includePaths)
{
    var pageXmlPage = new PageXmlDocument.PageXmlPage()
    {
        ImageFilename = "unknown",
        ImageHeight = (int)Math.Round(page.Height * scale),
        ImageWidth = (int)Math.Round(page.Width * scale),
    };

    var regions = new List<PageXmlDocument.PageXmlRegion>();

    var words = page.GetWords(wordExtractor).ToList();
    if (words.Count > 0)
    {
        var blocks = pageSegmenter.GetBlocks(words);

        if (readingOrderDetector != null)
        {
            blocks = readingOrderDetector.Get(blocks).ToList();
        }

        // AddRange enumerates the projection right away, so every text region is built
        // before orderedRegions is inspected below. Keep this statement order.
        // NOTE(review): orderedRegions is not declared in this method; presumably it is
        // filled while the text regions are created - confirm.
        regions.AddRange(blocks.Select(b => ToPageXmlTextRegion(b, page.Width, page.Height)));

        if (orderedRegions.Count > 0)
        {
            pageXmlPage.ReadingOrder = new PageXmlDocument.PageXmlReadingOrder()
            {
                Item = new PageXmlDocument.PageXmlOrderedGroup()
                {
                    Items = orderedRegions.ToArray(),
                    Id = "g" + groupOrderCount++
                }
            };
        }
    }

    var images = page.GetImages().ToList();
    if (images.Count > 0)
    {
        regions.AddRange(images.Select(i => ToPageXmlImageRegion(i, page.Width, page.Height)));
    }

    if (includePaths)
    {
        // Convert each path exactly once. The previous Count(...) check followed by
        // AddRange(...) enumerated the deferred query twice, running the conversion
        // twice per path. Adding an empty sequence is a no-op, so no count is needed.
        var graphicalElements = page.ExperimentalAccess.Paths
            .Select(p => ToPageXmlLineDrawingRegion(p, page.Width, page.Height))
            .Where(g => g != null);

        regions.AddRange(graphicalElements);
    }

    pageXmlPage.Items = regions.ToArray();
    return pageXmlPage;
}
/// <summary>
/// Lazily classifies every text block of a page, optionally using the document bookmarks that point to this page.
/// <para>The page letters are turned into words by <paramref name="wordExtractor"/>, the words into blocks by
/// <paramref name="pageSegmenter"/>, and each block's feature vector is fed to the prediction engine.</para>
/// </summary>
/// <param name="page">The page whose blocks are classified.</param>
/// <param name="wordExtractor">Builds words from the page letters.</param>
/// <param name="pageSegmenter">Groups the words into text blocks.</param>
/// <param name="bookmarks">Optional document bookmarks. When <c>null</c>, a <c>null</c> node list is passed to the feature extraction.</param>
/// <returns>For each block: the category found in <c>FeatureHelper.Categories</c> at the predicted index,
/// the highest value in the prediction's score collection, and the block itself.</returns>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
{
    // Keep only the document bookmark nodes that target this page.
    List<DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
        .OfType<DocumentBookmarkNode>()
        .Where(b => b.PageNumber == page.Number)
        .ToList();

    // Average() throws on an empty sequence; DefaultIfEmpty() makes a page without
    // letters yield an average of 0 instead of failing.
    var avgPageFontHeight = page.Letters
        .Select(l => l.GlyphRectangle.Height)
        .DefaultIfEmpty()
        .Average();

    var words = wordExtractor.GetWords(page.Letters);
    var blocks = pageSegmenter.GetBlocks(words);

    foreach (var block in blocks)
    {
        var paths = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
        var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

        var features = FeatureHelper.GetFeatures(
            block, paths, images, avgPageFontHeight, block.BoundingBox.Area, bookmarksNodes);

        // The feature vector is positional: index i feeds the i-th model input below.
        BlockFeatures blockFeatures = new BlockFeatures()
        {
            BlockAspectRatio = features[0],
            CharsCount = features[1],
            WordsCount = features[2],
            LinesCount = features[3],
            PctNumericChars = features[4],
            PctAlphabeticalChars = features[5],
            PctSymbolicChars = features[6],
            PctBulletChars = features[7],
            DeltaToHeight = features[8],
            PathsCount = features[9],
            PctBezierPaths = features[10],
            PctHorPaths = features[11],
            PctVertPaths = features[12],
            PctOblPaths = features[13],
            ImagesCount = features[14],
            ImageAvgProportion = features[15],
            BestNormEditDistance = features[16],
        };

        var result = predEngine.Predict(blockFeatures);

        yield return (FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
    }
}
/// <summary>
/// Get the hOCR string for the page.
/// <para>http://kba.cloud/hocr-spec/1.2/#elementdef-ocr_page</para>
/// <para>Increments <c>pageCount</c> on every call.</para>
/// </summary>
/// <param name="page">The page to convert.</param>
/// <param name="includePaths">Draw <see cref="PdfPath"/>s present in the page.</param>
/// <param name="imageName">Image name written in the <c>title</c> attribute of the page element.</param>
/// <returns>The hOCR markup for the page: paths (optional), images, then text blocks.</returns>
private string GetCode(Page page, bool includePaths, string imageName = "unknown")
{
    pageCount++;
    int level = 2;

    string hocr = GetIndent(level) + @"<div class='ocr_page' id='page_" + page.Number.ToString()
        + "' title='image \"" + imageName + "\"; bbox 0 0 "
        + (int)Math.Round(page.Width * scale) + " " + (int)Math.Round(page.Height * scale)
        + "; ppageno " + (page.Number - 1) + "\'>";

    if (includePaths)
    {
        foreach (var path in page.ExperimentalAccess.Paths)
        {
            hocr += "\n" + GetCode(path, page.Height, true, level + 1);
        }
    }

    foreach (var image in page.GetImages())
    {
        hocr += "\n" + GetCode(image, page.Height, level + 1);
    }

    // Materialise the words once: the sequence may be lazily evaluated, and it is
    // needed both for the emptiness check and for the segmentation itself.
    var words = page.GetWords(wordExtractor).ToList();
    if (words.Count > 0)
    {
        var blocks = pageSegmenter.GetBlocks(words);
        foreach (var block in blocks)
        {
            hocr += "\n" + GetCodeArea(block, page.Height, level + 1);
        }
    }

    hocr += "\n" + GetIndent(level) + @"</div>";
    return hocr;
}
/// <summary>
/// Gets the text blocks obtained by passing the result of <c>GetWords()</c> to the page segmenter.
/// </summary>
/// <returns>The blocks returned by <c>pageSegmenter.GetBlocks</c>.</returns>
public IEnumerable<TextBlock> GetTextBlocks()
{
    var words = GetWords();
    return pageSegmenter.GetBlocks(words);
}