/// <summary>
/// hOCR v1.2 (HTML)
/// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
/// </summary>
public HOcrTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
{
    this.wordExtractor = wordExtractor;
    this.pageSegmenter = pageSegmenter;
    this.scale = scale;
    indentChar = indent;
}
/// <summary>
/// Alto 4.1 (XML).
/// <para>See https://github.com/altoxml/schema </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split the page into blocks.</param>
/// <param name="scale">Scale multiplier to apply to the output document, defaults to 1.</param>
/// <param name="indent">Character to use for indentation, defaults to tab.</param>
public AltoXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, decimal scale = 1.0m, string indent = "\t")
{
    this.wordExtractor = wordExtractor ?? throw new ArgumentNullException(nameof(wordExtractor));
    this.pageSegmenter = pageSegmenter ?? throw new ArgumentNullException(nameof(pageSegmenter));
    this.scale = scale;
    indentChar = indent ?? string.Empty;
}
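// A minimal usage sketch for the Alto exporter above, assuming PdfPig's usual entry points
// (PdfDocument.Open, GetPage) and that the exporter exposes a Get(Page) method returning the
// ALTO XML as a string; file names are placeholders, adjust to the actual API if it differs.
using (var document = PdfDocument.Open("document.pdf"))
{
    var altoExporter = new AltoXmlTextExporter(
        NearestNeighbourWordExtractor.Instance,   // word extractor shown elsewhere in this listing
        DocstrumBoundingBoxes.Instance);          // page segmenter shown elsewhere in this listing

    Page page = document.GetPage(1);

    // Export the page layout and text to ALTO XML and write it to disk.
    string altoXml = altoExporter.Get(page);
    File.WriteAllText("page1.alto.xml", altoXml);
}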
public hOCR(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, double scale = 1.0, string indent = "\t")
{
    _wordExtractor = wordExtractor;
    _pageSegmenter = pageSegmenter;
    _scale = (decimal)scale;
    _indent = indent;
}
/// <summary>
/// PAGE-XML 2019-07-15 (XML) text exporter.
/// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para>
/// </summary>
/// <param name="wordExtractor">Extractor used to identify words in the document.</param>
/// <param name="pageSegmenter">Segmenter used to split the page into blocks.</param>
/// <param name="readingOrderDetector">Detector used to determine the reading order of the blocks.</param>
/// <param name="scale">Scale multiplier to apply to the output document, defaults to 1.</param>
/// <param name="indent">Indent character.</param>
public PageXmlTextExporter(IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, IReadingOrderDetector readingOrderDetector = null, double scale = 1.0, string indent = "\t")
{
    this.wordExtractor = wordExtractor;
    this.pageSegmenter = pageSegmenter;
    this.readingOrderDetector = readingOrderDetector;
    this.scale = scale;
    this.indentChar = indent;
}
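// A similar sketch for the PAGE-XML exporter, again assuming a Get(Page) method. The reading order
// detector shown (UnsupervisedReadingOrderDetector) is an example and may need to be swapped for
// whatever IReadingOrderDetector implementation is available; the parameter can also be left null.
var pageXmlExporter = new PageXmlTextExporter(
    NearestNeighbourWordExtractor.Instance,
    DocstrumBoundingBoxes.Instance,
    new UnsupervisedReadingOrderDetector(),
    scale: 1.0,
    indent: "  ");

using (var document = PdfDocument.Open("document.pdf"))
{
    string pageXml = pageXmlExporter.Get(document.GetPage(1));
    File.WriteAllText("page1.page.xml", pageXml);
}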
public void SetPageSegmenter(Type pageSegmenter)
{
    if (pageSegmenter == null)
    {
        return;
    }

    this.pageSegmenter = (IPageSegmenter)Activator.CreateInstance(pageSegmenter);
}
internal PdfPageModel(Page page)
{
    this.page = page;
    wordExtractor = NearestNeighbourWordExtractor.Instance;
    pageSegmenter = DocstrumBoundingBoxes.Instance;
}
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
/// <param name="wordExtractor">Extractor used to identify words on each page.</param>
/// <param name="pageSegmenter">Segmenter used to split each page into blocks.</param>
/// <param name="minimumEditDistanceNormalised">Normalised minimum edit distance function. A value of 0 means both strings are exactly equal.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
    Func<string, string, double> minimumEditDistanceNormalised, double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
    if (pages.Count < 2)
    {
        throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
    }

    ConcurrentDictionary<int, IReadOnlyList<TextBlock>> pagesBlocks = new ConcurrentDictionary<int, IReadOnlyList<TextBlock>>();

    ParallelOptions parallelOptions = new ParallelOptions() { MaxDegreeOfParallelism = maxDegreeOfParallelism };

    Parallel.For(0, pages.Count, parallelOptions, p =>
    {
        var words = pages[p].GetWords(wordExtractor);
        var blocks = pageSegmenter.GetBlocks(words);
        if (!pagesBlocks.TryAdd(p, blocks))
        {
            throw new ArgumentException("Cannot add element with index " + p + " in ConcurrentDictionary.");
        }
    });

    return Get(pagesBlocks.OrderBy(x => x.Key).Select(x => x.Value).ToList(),
        minimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism);
}
/// <summary>
/// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
/// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
/// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
/// left or right edge of the page.</para>
/// </summary>
/// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
/// <param name="wordExtractor">Extractor used to identify words on each page.</param>
/// <param name="pageSegmenter">Segmenter used to split each page into blocks.</param>
/// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
/// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
/// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
/// <para>A positive value limits the number of concurrent operations to the set value.
/// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter,
    double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
{
    return Get(pages, wordExtractor, pageSegmenter, Distances.MinimumEditDistanceNormalised, similarityThreshold, n, maxDegreeOfParallelism);
}
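// A usage sketch for the two decoration overloads above, assuming they live on a static class named
// DecorationTextBlockClassifier (the containing type is not shown in this listing). All pages must be
// loaded first, as the summary requires; the result is one list of decoration blocks per page, in order.
using (var document = PdfDocument.Open("document.pdf"))
{
    var pages = Enumerable.Range(1, document.NumberOfPages)
        .Select(i => document.GetPage(i))
        .ToList();

    IReadOnlyList<IReadOnlyList<TextBlock>> decorations = DecorationTextBlockClassifier.Get(
        pages,
        NearestNeighbourWordExtractor.Instance,
        DocstrumBoundingBoxes.Instance,
        similarityThreshold: 0.25,
        n: 5,
        maxDegreeOfParallelism: -1);
}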
/// <summary>
/// Classify each text block on the page, using the document's bookmarks as an additional feature.
/// Returns the predicted category, the prediction score and the block.
/// </summary>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
{
    List<DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
        .Where(b => b is DocumentBookmarkNode)
        .Cast<DocumentBookmarkNode>()
        .Where(b => b.PageNumber == page.Number)
        .ToList();

    var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

    var words = wordExtractor.GetWords(page.Letters);
    var blocks = pageSegmenter.GetBlocks(words);

    foreach (var block in blocks)
    {
        var letters = block.TextLines.SelectMany(li => li.Words).SelectMany(w => w.Letters);
        var paths = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
        var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

        var features = FeatureHelper.GetFeatures(block, paths, images, avgPageFontHeight, block.BoundingBox.Area, bookmarksNodes);

        BlockFeatures blockFeatures = new BlockFeatures()
        {
            BlockAspectRatio = features[0],
            CharsCount = features[1],
            WordsCount = features[2],
            LinesCount = features[3],
            PctNumericChars = features[4],
            PctAlphabeticalChars = features[5],
            PctSymbolicChars = features[6],
            PctBulletChars = features[7],
            DeltaToHeight = features[8],
            PathsCount = features[9],
            PctBezierPaths = features[10],
            PctHorPaths = features[11],
            PctVertPaths = features[12],
            PctOblPaths = features[13],
            ImagesCount = features[14],
            ImageAvgProportion = features[15],
            BestNormEditDistance = features[16],
        };

        var result = predEngine.Predict(blockFeatures);

        yield return (FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
    }
}
/// <summary>
/// Classify each text block on the page without using bookmark features.
/// Returns the predicted category, the prediction score and the block.
/// </summary>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
{
    var words = wordExtractor.GetWords(page.Letters);
    var blocks = pageSegmenter.GetBlocks(words);

    foreach (var block in blocks)
    {
        var letters = block.TextLines.SelectMany(li => li.Words).SelectMany(w => w.Letters);
        var paths = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
        var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

        var features = FeatureHelper.GetFeatures(page, block.BoundingBox, letters, paths, images);

        BlockFeatures blockFeatures = new BlockFeatures()
        {
            CharsCount = features[0],
            PctNumericChars = features[1],
            PctAlphabeticalChars = features[2],
            PctSymbolicChars = features[3],
            PctBulletChars = features[4],
            DeltaToHeight = features[5],
            PathsCount = features[6],
            PctBezierPaths = features[7],
            PctHorPaths = features[8],
            PctVertPaths = features[9],
            PctOblPaths = features[10],
            ImagesCount = features[11],
            ImageAvgProportion = features[12]
        };

        var result = predEngine.Predict(blockFeatures);

        yield return (FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
    }
}
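// A usage sketch for the Classify overloads above, assuming an instance of the containing classifier
// class (named blockClassifier here purely for illustration) whose prediction engine has already been
// loaded. The simpler overload without bookmarks is used; block.Text and Console output are placeholders.
using (var document = PdfDocument.Open("document.pdf"))
{
    Page page = document.GetPage(1);

    foreach (var (prediction, score, block) in blockClassifier.Classify(
        page,
        NearestNeighbourWordExtractor.Instance,
        DocstrumBoundingBoxes.Instance))
    {
        Console.WriteLine($"{prediction} ({score:0.00}): {block.Text}");
    }
}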