Example #1
0
 /// <summary>
 /// Creates a text exporter for the hOCR v1.2 (HTML) format.
 /// <para>See http://kba.cloud/hocr-spec/1.2/ </para>
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this exporter.</param>
 /// <param name="pageSegmenter">Page segmenter kept by this exporter.</param>
 /// <param name="scale">Scale value kept by this exporter, defaults to 1.0.</param>
 /// <param name="indent">Indentation string kept by this exporter, defaults to a tab.</param>
 public HOcrTextExporter(
     IWordExtractor wordExtractor,
     IPageSegmenter pageSegmenter,
     double scale = 1.0,
     string indent = "\t")
 {
     // Formatting options first, then the collaborators.
     indentChar = indent;
     this.scale = scale;

     this.pageSegmenter = pageSegmenter;
     this.wordExtractor = wordExtractor;
 }
        /// <summary>
        /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
        /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
        /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
        /// left or right edge of the page.</para>
        /// <para>The words and blocks of every page are computed in parallel, then handed over (in page order) to the
        /// overload working directly on text blocks.</para>
        /// </summary>
        /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
        /// <param name="wordExtractor">Word extractor passed to each page when getting its words.</param>
        /// <param name="pageSegmenter">Page segmenter used to build the text blocks of each page from its words.</param>
        /// <param name="minimumEditDistanceNormalised">Minimum edit distance normalised. A value of 0 means both strings are exactly equal.</param>
        /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
        /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
        /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
        /// <para>A positive property value limits the number of concurrent operations to the set value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <exception cref="ArgumentNullException"><paramref name="pages"/> or <paramref name="pageSegmenter"/> is null.</exception>
        /// <exception cref="ArgumentException">The document has less than 2 pages.</exception>
        public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(IReadOnlyList<Page> pages,
                                                                  IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Func<string, string, double> minimumEditDistanceNormalised,
                                                                  double similarityThreshold = 0.25, int n = 5, int maxDegreeOfParallelism = -1)
        {
            if (pages == null)
            {
                throw new ArgumentNullException(nameof(pages));
            }

            // Fail fast here rather than from inside the parallel loop, where the
            // failure would be wrapped in an AggregateException.
            if (pageSegmenter == null)
            {
                throw new ArgumentNullException(nameof(pageSegmenter));
            }

            if (pages.Count < 2)
            {
                throw new ArgumentException("The algorithm cannot be used with a document of less than 2 pages.", nameof(pages));
            }

            // One slot per page: each iteration writes only to its own index, so no
            // concurrent collection (and no re-ordering afterwards) is needed.
            var pagesBlocks = new IReadOnlyList<TextBlock>[pages.Count];

            var parallelOptions = new ParallelOptions
            {
                MaxDegreeOfParallelism = maxDegreeOfParallelism
            };

            Parallel.For(0, pages.Count, parallelOptions, p =>
            {
                var words = pages[p].GetWords(wordExtractor);
                pagesBlocks[p] = pageSegmenter.GetBlocks(words);
            });

            return Get(pagesBlocks.ToList(),
                       minimumEditDistanceNormalised,
                       similarityThreshold,
                       n,
                       maxDegreeOfParallelism);
        }
Example #3
0
 /// <summary>
 /// Creates an hOCR exporter from the given word extractor, page segmenter and formatting options.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this instance.</param>
 /// <param name="pageSegmenter">Page segmenter kept by this instance.</param>
 /// <param name="scale">Scale value, defaults to 1.0. It is converted to <see cref="decimal"/> before being stored.</param>
 /// <param name="indent">Indentation string, defaults to a tab.</param>
 public hOCR(
     IWordExtractor wordExtractor,
     IPageSegmenter pageSegmenter,
     double scale = 1.0,
     string indent = "\t")
 {
     _indent = indent;
     _scale = (decimal)scale; // stored as decimal, hence the explicit conversion

     _pageSegmenter = pageSegmenter;
     _wordExtractor = wordExtractor;
 }
Example #4
0
 /// <summary>
 /// Creates a text exporter for the Alto 4.1 (XML) format.
 /// <para>See https://github.com/altoxml/schema </para>
 /// </summary>
 /// <param name="wordExtractor">Extractor used to identify words in the document. Must not be null.</param>
 /// <param name="pageSegmenter">Segmenter used to split page into blocks. Must not be null.</param>
 /// <param name="scale">Scale multiplier to apply to output document, defaults to 1.</param>
 /// <param name="indent">Character to use for indentation, defaults to tab. A null value is replaced by an empty string.</param>
 /// <exception cref="ArgumentNullException"><paramref name="wordExtractor"/> or <paramref name="pageSegmenter"/> is null.</exception>
 public AltoXmlTextExporter(
     IWordExtractor wordExtractor,
     IPageSegmenter pageSegmenter,
     decimal scale = 1.0m,
     string indent = "\t")
 {
     // Validate the mandatory collaborators, in parameter order, before storing anything.
     if (wordExtractor == null)
     {
         throw new ArgumentNullException(nameof(wordExtractor));
     }

     if (pageSegmenter == null)
     {
         throw new ArgumentNullException(nameof(pageSegmenter));
     }

     this.wordExtractor = wordExtractor;
     this.pageSegmenter = pageSegmenter;
     this.scale = scale;
     indentChar = indent == null ? string.Empty : indent;
 }
Example #5
0
 /// <summary>
 /// Creates a text exporter for the PAGE-XML 2019-07-15 (XML) format.
 /// <para>See https://github.com/PRImA-Research-Lab/PAGE-XML </para>
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this exporter.</param>
 /// <param name="pageSegmenter">Page segmenter kept by this exporter.</param>
 /// <param name="readingOrderDetector">Optional reading order detector kept by this exporter, defaults to null.</param>
 /// <param name="scale">Scale value kept by this exporter, defaults to 1.0.</param>
 /// <param name="indent">Indent character, defaults to a tab.</param>
 public PageXmlTextExporter(
     IWordExtractor wordExtractor,
     IPageSegmenter pageSegmenter,
     IReadingOrderDetector readingOrderDetector = null,
     double scale = 1.0,
     string indent = "\t")
 {
     // Formatting options.
     this.indentChar = indent;
     this.scale = scale;

     // Layout analysis collaborators.
     this.readingOrderDetector = readingOrderDetector;
     this.pageSegmenter = pageSegmenter;
     this.wordExtractor = wordExtractor;
 }
Example #6
0
 /// <summary>
 /// Creates a renderer from the collaborators it keeps a reference to.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this renderer.</param>
 /// <param name="blackList">Black list kept by this renderer.</param>
 /// <param name="vizualizer">Tag cloud vizualizer kept by this renderer.</param>
 /// <param name="wordCloudConfig">Word cloud configuration kept by this renderer.</param>
 public TagCloudRenderer(
     IWordExtractor wordExtractor,
     IBlackList blackList,
     ITagCloudVizualizer vizualizer,
     WordCloudConfig wordCloudConfig)
 {
     this.wordCloudConfig = wordCloudConfig;
     this.vizualizer = vizualizer;
     this.blackList = blackList;
     this.wordExtractor = wordExtractor;
 }
 /// <summary>
 /// Creates the phase, forwarding the application environment to the base class.
 /// </summary>
 /// <param name="normalizer">Normalizer kept by this phase.</param>
 /// <param name="wordExtractor">Word extractor kept by this phase.</param>
 /// <param name="appEnvironment">Application environment passed to the base constructor.</param>
 public NormalizationEnginePhase(INormalizer normalizer, IWordExtractor wordExtractor, IAppEnvironment<NormalizationEnginePhase> appEnvironment)
     : base(appEnvironment)
 {
     this.wordExtractor = wordExtractor;
     this.normalizer = normalizer;
 }
Example #8
0
 /// <summary>
 /// Creates the form, stores its collaborators and configures the picture box size mode.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this form.</param>
 /// <param name="blackList">Black list kept by this form.</param>
 /// <param name="vizualizer">Tag cloud vizualizer kept by this form.</param>
 /// <param name="wordCloudConfig">Word cloud configuration kept by this form.</param>
 public TagClodForm(IWordExtractor wordExtractor, IBlackList blackList, ITagCloudVizualizer vizualizer, WordCloudConfig wordCloudConfig)
 {
     this.wordExtractor = wordExtractor;
     this.blackList     = blackList;
     this.vizualizer    = vizualizer;
     // NOTE(review): presumably the designer-generated initialisation that creates
     // pictureBox1 - confirm. pictureBox1 is only touched after this call.
     InitializeComponent();
     this.wordCloudConfig = wordCloudConfig;
     // Stretch the displayed image with the picture box.
     pictureBox1.SizeMode = PictureBoxSizeMode.StretchImage;
 }
        /// <summary>
        /// Sets the word extractor from its <see cref="Type"/>.
        /// <para>The type is first instantiated through <see cref="Activator.CreateInstance(Type)"/>. If that fails
        /// for any reason, the value returned by the type's <c>get_Instance</c> accessor (the getter of an
        /// <c>Instance</c> property) is used instead.</para>
        /// </summary>
        /// <param name="wordExtractor">The type of the word extractor to use. Nothing happens when it is null.</param>
        /// <exception cref="ArgumentException">The type could not be instantiated and has no
        /// <c>Instance</c> property getter to fall back on.</exception>
        public void SetWordExtractor(Type wordExtractor)
        {
            if (wordExtractor == null)
            {
                return;
            }

            try
            {
                this.wordExtractor = (IWordExtractor)Activator.CreateInstance(wordExtractor);
            }
            catch (Exception ex)
            {
                // Best-effort fallback for types exposing an 'Instance' property
                // instead of a usable parameterless constructor.
                var instanceGetter = wordExtractor.GetMethod("get_Instance");
                if (instanceGetter == null)
                {
                    // Report the actual problem (and keep the original failure as inner
                    // exception) instead of failing with a NullReferenceException.
                    throw new ArgumentException(
                        "Type '" + wordExtractor.FullName + "' could not be instantiated and has no 'Instance' property getter.",
                        nameof(wordExtractor),
                        ex);
                }

                this.wordExtractor = (IWordExtractor)instanceGetter.Invoke(null, null);
            }
        }
Example #10
0
 /// <summary>
 /// Creates a cloud painter from the collaborators it keeps a reference to.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this painter.</param>
 /// <param name="formatter">Formatter kept by this painter.</param>
 /// <param name="filter">Filter kept by this painter.</param>
 /// <param name="lexicAnalysator">Lexic analysator kept by this painter.</param>
 /// <param name="cloudLayouter">Cloud layouter kept by this painter.</param>
 /// <param name="textVisualisator">Text visualisator kept by this painter.</param>
 public CloudPainter(IWordExtractor wordExtractor, IFormatter formatter, IFilter filter,
                     IAnalysator lexicAnalysator, ICloudLayouter cloudLayouter, ITextVisualisator textVisualisator)
 {
     // Stored in parameter order.
     this.wordExtractor = wordExtractor;
     this.formatter = formatter;
     this.filter = filter;
     this.lexicAnalysator = lexicAnalysator;
     this.cloudLayouter = cloudLayouter;
     this.textVisualisator = textVisualisator;
 }
Example #11
0
 /// <summary>
 /// Creates a cloud painter from the collaborators it keeps a reference to.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this painter.</param>
 /// <param name="formatter">Formatter kept by this painter.</param>
 /// <param name="filter">Filter kept by this painter.</param>
 /// <param name="lexicAnalysator">Lexic analysator kept by this painter.</param>
 /// <param name="cloudLayouter">Cloud layouter kept by this painter.</param>
 /// <param name="textVisualisator">Text visualisator kept by this painter.</param>
 /// <param name="textCleaner">Text cleaner kept by this painter.</param>
 public CloudPainter(IWordExtractor wordExtractor, IFormatter formatter, IFilter filter,
                     IAnalysator lexicAnalysator, ICloudLayouter cloudLayouter,
                     ITextVisualisator textVisualisator, ITextCleaner textCleaner)
 {
     // Stored in parameter order.
     _wordExtractor = wordExtractor;
     _formatter = formatter;
     _filter = filter;
     _lexicAnalysator = lexicAnalysator;
     _cloudLayouter = cloudLayouter;
     _textVisualisator = textVisualisator;
     _textCleaner = textCleaner;
 }
Example #12
0
 /// <summary>
 /// Gets the words for this page using a custom <see cref="IWordExtractor"/>.
 /// </summary>
 /// <param name="wordExtractor">The word extractor to use to generate words.
 /// When null, <see cref="DefaultWordExtractor.Instance"/> is used instead.</param>
 /// <returns>The words on this page.</returns>
 public IEnumerable<Word> GetWords(IWordExtractor wordExtractor)
 {
     // Fall back to the default extractor when the caller does not supply one.
     IWordExtractor extractor = wordExtractor ?? DefaultWordExtractor.Instance;

     return extractor.GetWords(Letters);
 }
Example #13
0
 /// <summary>
 /// Creates the model for the given page, using <see cref="NearestNeighbourWordExtractor.Instance"/>
 /// as word extractor and <see cref="DocstrumBoundingBoxes.Instance"/> as page segmenter.
 /// </summary>
 /// <param name="page">The page kept by this model.</param>
 internal PdfPageModel(Page page)
 {
     // Default layout analysis tools.
     wordExtractor = NearestNeighbourWordExtractor.Instance;
     pageSegmenter = DocstrumBoundingBoxes.Instance;

     this.page = page;
 }
Example #14
0
 /// <summary>
 /// Perform indexation of documents <paramref name="allDocuments"/>.
 /// </summary>
 private static IReadOnlyDictionary <Term, DocLinks> PerformIndexation(
     IEnumerable <IDocument> allDocuments,
     IWordExtractor wordExtractor)
 => allDocuments
 .SelectMany(document => document
        /// <summary>
        /// Initialises the component and creates the <see cref="WordExtractor"/> it uses.
        /// </summary>
        public Word_Extractor()
        {
            // NOTE(review): presumably the designer-generated initialisation - confirm.
            InitializeComponent();

            wordExtractor = new WordExtractor();
        }
 /// <summary>
 /// Get blocks that are labelled as decoration for each page in the document, using a content and a geometric similarity measure.
 /// <para>Decoration blocks are blocks that contain information such as author names, publication titles, page numbers, etc.
 /// They are printed repeatedly at the border of each page, usually placed inside headers or footers, but sometimes also at the
 /// left or right edge of the page.</para>
 /// <para>This overload uses <see cref="Distances.MinimumEditDistanceNormalised"/> as the edit distance.</para>
 /// </summary>
 /// <param name="pages">The <see cref="Page"/>s in the document. All of them are needed for the algorithm to work.</param>
 /// <param name="wordExtractor">Word extractor forwarded to the overload taking an edit distance.</param>
 /// <param name="pageSegmenter">Page segmenter forwarded to the overload taking an edit distance.</param>
 /// <param name="similarityThreshold">Minimum similarity score to decide whether a block is labelled as decoration or not.</param>
 /// <param name="n">Number of blocks in a page to be considered when looking for decoration blocks.</param>
 /// <param name="maxDegreeOfParallelism">Sets the maximum number of concurrent tasks enabled.
 /// <para>A positive property value limits the number of concurrent operations to the set value.
 /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
 public static IReadOnlyList<IReadOnlyList<TextBlock>> Get(
     IReadOnlyList<Page> pages,
     IWordExtractor wordExtractor,
     IPageSegmenter pageSegmenter,
     double similarityThreshold = 0.25,
     int n = 5,
     int maxDegreeOfParallelism = -1)
 {
     // Same call as the full overload, with the default edit distance plugged in.
     return Get(
         pages,
         wordExtractor,
         pageSegmenter,
         Distances.MinimumEditDistanceNormalised,
         similarityThreshold,
         n,
         maxDegreeOfParallelism);
 }
Example #17
0
 /// <summary>
 /// Creates the phase, forwarding the application environment to the base class.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this phase.</param>
 /// <param name="appEnvironment">Application environment passed to the base constructor.</param>
 public IndexingEnginePhase(IWordExtractor wordExtractor, IAppEnvironment<IndexingEnginePhase> appEnvironment)
     : base(appEnvironment)
 {
     this.wordExtractor = wordExtractor;
 }
Example #18
0
        /// <summary>
        /// Splits the page into text blocks and yields, for each block, the predicted category,
        /// the highest prediction score and the block itself.
        /// <para>This is an iterator: the work is done lazily, while the result is enumerated.</para>
        /// </summary>
        /// <param name="page">The page to classify the blocks of.</param>
        /// <param name="wordExtractor">Word extractor used to get the words from the page's letters.</param>
        /// <param name="pageSegmenter">Page segmenter used to get the blocks from the words.</param>
        /// <returns>One (prediction, score, block) tuple per block returned by the page segmenter.</returns>
        public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
        {
            var pageWords = wordExtractor.GetWords(page.Letters);

            foreach (var textBlock in pageSegmenter.GetBlocks(pageWords))
            {
                // Content located inside the block's bounding box.
                var blockLetters = textBlock.TextLines.SelectMany(line => line.Words).SelectMany(word => word.Letters);
                var blockPaths   = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
                var blockImages  = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());

                var f = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, blockPaths, blockImages);

                // Map the positional feature vector onto the named model input.
                var input = new BlockFeatures
                {
                    CharsCount           = f[0],
                    PctNumericChars      = f[1],
                    PctAlphabeticalChars = f[2],
                    PctSymbolicChars     = f[3],
                    PctBulletChars       = f[4],
                    DeltaToHeight        = f[5],
                    PathsCount           = f[6],
                    PctBezierPaths       = f[7],
                    PctHorPaths          = f[8],
                    PctVertPaths         = f[9],
                    PctOblPaths          = f[10],
                    ImagesCount          = f[11],
                    ImageAvgProportion   = f[12]
                };

                var output = predEngine.Predict(input);

                yield return (FeatureHelper.Categories[(int)output.Prediction], output.Score.Max(), textBlock);
            }
        }
Example #19
0
 /// <summary>
 /// Creates the index by performing the indexation of <paramref name="allDocuments"/>.
 /// </summary>
 /// <param name="allDocuments">
 /// Documents to be indexed.
 /// </param>
 /// <param name="wordExtractor">
 /// Word extractor.
 /// </param>
 public TermsIndex(IEnumerable<IDocument> allDocuments, IWordExtractor wordExtractor)
 {
     termsToDocuments = PerformIndexation(allDocuments, wordExtractor);
 }
Example #20
0
 /// <summary>
 /// Creates a word composer using the given word extractor.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this composer.</param>
 public WordComposer(IWordExtractor wordExtractor)
     => _wordExtractor = wordExtractor;
Example #21
0
        /// <summary>
        /// Splits the page into text blocks and yields, for each block, the predicted category,
        /// the highest prediction score and the block itself.
        /// <para>This is an iterator: the work is done lazily, while the result is enumerated.</para>
        /// </summary>
        /// <param name="page">The page to classify the blocks of.</param>
        /// <param name="wordExtractor">Word extractor used to get the words from the page's letters.</param>
        /// <param name="pageSegmenter">Page segmenter used to get the blocks from the words.</param>
        /// <param name="bookmarks">Optional document bookmarks. When provided, the document bookmark nodes
        /// pointing to this page are passed to the feature extraction; otherwise null is passed.</param>
        /// <returns>One (prediction, score, block) tuple per block returned by the page segmenter.</returns>
        public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor,
                                                                                      IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
        {
            // Keep only the document bookmark nodes that point to this page
            // (stays null when no bookmarks are provided).
            List<DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
                                                        .OfType<DocumentBookmarkNode>()
                                                        .Where(b => b.PageNumber == page.Number)
                                                        .ToList();

            // DefaultIfEmpty() keeps Average() from throwing on a page without any letter
            // (e.g. an image-only page); the average is then the default value, 0.
            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).DefaultIfEmpty().Average();

            var words  = wordExtractor.GetWords(page.Letters);
            var blocks = pageSegmenter.GetBlocks(words);

            foreach (var block in blocks)
            {
                // Paths and images located inside the block's bounding box.
                var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(
                    block, paths,
                    images, avgPageFontHeight,
                    block.BoundingBox.Area,
                    bookmarksNodes);

                // Map the positional feature vector onto the named model input.
                BlockFeatures blockFeatures = new BlockFeatures()
                {
                    BlockAspectRatio     = features[0],
                    CharsCount           = features[1],
                    WordsCount           = features[2],
                    LinesCount           = features[3],
                    PctNumericChars      = features[4],
                    PctAlphabeticalChars = features[5],
                    PctSymbolicChars     = features[6],
                    PctBulletChars       = features[7],
                    DeltaToHeight        = features[8],
                    PathsCount           = features[9],
                    PctBezierPaths       = features[10],
                    PctHorPaths          = features[11],
                    PctVertPaths         = features[12],
                    PctOblPaths          = features[13],
                    ImagesCount          = features[14],
                    ImageAvgProportion   = features[15],
                    BestNormEditDistance = features[16],
                };

                var result = predEngine.Predict(blockFeatures);

                yield return (FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
            }
        }
 /// <summary>
 /// Creates the subphase, forwarding the application environment to the base class.
 /// </summary>
 /// <param name="wordExtractor">Word extractor kept by this subphase.</param>
 /// <param name="appEnvironment">Application environment passed to the base constructor.</param>
 public StatsCollectionSubphase(IWordExtractor wordExtractor, IAppEnvironment<StatsCollectionSubphase> appEnvironment)
     : base(appEnvironment)
 {
     this.wordExtractor = wordExtractor;
 }