コード例 #1
0
        /// <summary>
        /// Splits <paramref name="page"/> into text blocks and predicts a category for each one.
        /// </summary>
        /// <param name="page">Page supplying the letters, paths and images to analyse.</param>
        /// <param name="wordExtractor">Groups the page letters into words.</param>
        /// <param name="pageSegmenter">Groups the extracted words into text blocks.</param>
        /// <returns>
        /// One tuple per block: the category name looked up in <c>FeatureHelper.Categories</c>,
        /// the maximum of the prediction scores, and the block itself.
        /// </returns>
        /// <remarks>
        /// Implemented as an iterator, so no work happens until the result is enumerated.
        /// </remarks>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
        {
            var segmentedBlocks = pageSegmenter.GetBlocks(wordExtractor.GetWords(page.Letters));

            foreach (var textBlock in segmentedBlocks)
            {
                // Flatten block -> lines -> words -> letters (lazy, same order as nested SelectMany).
                var blockLetters = from line in textBlock.TextLines
                                   from word in line.Words
                                   from letter in word.Letters
                                   select letter;

                var enclosedPaths  = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
                var enclosedImages = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());
                var vector         = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, enclosedPaths, enclosedImages);

                // NOTE(review): the indices below presumably mirror the order in which
                // FeatureHelper.GetFeatures emits its values - confirm against that helper.
                var modelInput = new BlockFeatures
                {
                    CharsCount           = vector[0],
                    PctNumericChars      = vector[1],
                    PctAlphabeticalChars = vector[2],
                    PctSymbolicChars     = vector[3],
                    PctBulletChars       = vector[4],
                    DeltaToHeight        = vector[5],
                    PathsCount           = vector[6],
                    PctBezierPaths       = vector[7],
                    PctHorPaths          = vector[8],
                    PctVertPaths         = vector[9],
                    PctOblPaths          = vector[10],
                    ImagesCount          = vector[11],
                    ImageAvgProportion   = vector[12]
                };

                var outcome  = predEngine.Predict(modelInput);
                var category = FeatureHelper.Categories[(int)outcome.Prediction];
                var topScore = outcome.Score.Max();

                yield return (category, topScore, textBlock);
            }
        }
コード例 #2
0
        /// <summary>
        /// Classifies every text block found on <paramref name="page"/>, passing the document
        /// bookmarks that target this page (when supplied) to the feature extraction.
        /// </summary>
        /// <param name="page">Page supplying the letters, paths and images to analyse.</param>
        /// <param name="wordExtractor">Groups the page letters into words.</param>
        /// <param name="pageSegmenter">Groups the extracted words into text blocks.</param>
        /// <param name="bookmarks">
        /// Optional document bookmarks. When <c>null</c>, a <c>null</c> node list is handed to
        /// <c>FeatureHelper.GetFeatures</c>.
        /// </param>
        /// <returns>
        /// One tuple per block: the category name looked up in <c>FeatureHelper.Categories</c>,
        /// the maximum of the prediction scores, and the block itself.
        /// </returns>
        /// <remarks>
        /// Implemented as an iterator, so no work happens until the result is enumerated.
        /// </remarks>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor,
                                                                                        IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
        {
            // OfType filters and casts in a single pass; the whole chain stays null when
            // no bookmarks were supplied.
            List <DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
                                                         .OfType <DocumentBookmarkNode>()
                                                         .Where(b => b.PageNumber == page.Number)
                                                         .ToList();

            // Average() throws InvalidOperationException on an empty sequence, so a page
            // without letters falls back to a height of 0 instead of failing.
            var avgPageFontHeight = page.Letters
                                    .Select(l => l.GlyphRectangle.Height)
                                    .DefaultIfEmpty()
                                    .Average();

            var words  = wordExtractor.GetWords(page.Letters);
            var blocks = pageSegmenter.GetBlocks(words);

            foreach (var block in blocks)
            {
                var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(
                    block, paths,
                    images, avgPageFontHeight,
                    block.BoundingBox.Area,
                    bookmarksNodes);

                // NOTE(review): the indices below presumably mirror the order in which
                // FeatureHelper.GetFeatures emits its values - confirm against that helper.
                BlockFeatures blockFeatures = new BlockFeatures()
                {
                    BlockAspectRatio     = features[0],
                    CharsCount           = features[1],
                    WordsCount           = features[2],
                    LinesCount           = features[3],
                    PctNumericChars      = features[4],
                    PctAlphabeticalChars = features[5],
                    PctSymbolicChars     = features[6],
                    PctBulletChars       = features[7],
                    DeltaToHeight        = features[8],
                    PathsCount           = features[9],
                    PctBezierPaths       = features[10],
                    PctHorPaths          = features[11],
                    PctVertPaths         = features[12],
                    PctOblPaths          = features[13],
                    ImagesCount          = features[14],
                    ImageAvgProportion   = features[15],
                    BestNormEditDistance = features[16],
                };

                var result = predEngine.Predict(blockFeatures);

                yield return(FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
            }
        }
コード例 #3
0
        /// <summary>
        /// Predicts the category of a single text block from features built out of the block,
        /// the supplied paths and images, the page's average font height and the page's
        /// bookmark nodes.
        /// </summary>
        /// <param name="textBlock">Block to classify.</param>
        /// <param name="paths">Paths forwarded to <c>FeatureHelper.GetFeatures</c>.</param>
        /// <param name="images">Images forwarded to <c>FeatureHelper.GetFeatures</c>.</param>
        /// <param name="averagePageFontHeight">Average font height forwarded to the feature extraction.</param>
        /// <param name="pageBookmarksNodes">Bookmark nodes forwarded to the feature extraction.</param>
        /// <returns>
        /// The category name looked up in <c>FeatureHelper.Categories</c> and the maximum of
        /// the prediction scores.
        /// </returns>
        public (string Prediction, float Score) Classify(TextBlock textBlock, IEnumerable <PdfPath> paths, IEnumerable <IPdfImage> images,
                                                         double averagePageFontHeight, List <DocumentBookmarkNode> pageBookmarksNodes)
        {
            var features = FeatureHelper.GetFeatures(
                textBlock, paths,
                images, averagePageFontHeight,
                textBlock.BoundingBox.Area,
                pageBookmarksNodes);

            // NOTE(review): the indices below presumably mirror the order in which
            // FeatureHelper.GetFeatures emits its values - confirm against that helper.
            BlockFeatures blockFeatures = new BlockFeatures()
            {
                BlockAspectRatio     = features[0],
                CharsCount           = features[1],
                WordsCount           = features[2],
                LinesCount           = features[3],
                PctNumericChars      = features[4],
                PctAlphabeticalChars = features[5],
                PctSymbolicChars     = features[6],
                PctBulletChars       = features[7],
                DeltaToHeight        = features[8],
                PathsCount           = features[9],
                PctBezierPaths       = features[10],
                PctHorPaths          = features[11],
                PctVertPaths         = features[12],
                PctOblPaths          = features[13],
                ImagesCount          = features[14],
                ImageAvgProportion   = features[15],
                BestNormEditDistance = features[16],
            };

            var result = predEngine.Predict(blockFeatures);

            return(FeatureHelper.Categories[(int)result.Prediction], result.Score.Max());
        }