示例#1
0
        /// <summary>
        /// Splits <paramref name="page"/> into text blocks and classifies each block with the prediction engine.
        /// </summary>
        /// <param name="page">Page whose letters, paths and images are read.</param>
        /// <param name="wordExtractor">Turns the page's letters into words.</param>
        /// <param name="pageSegmenter">Turns the extracted words into blocks.</param>
        /// <returns>
        /// One tuple per block: the entry of <c>FeatureHelper.Categories</c> at the predicted index,
        /// the maximum of the prediction's scores, and the block itself. The method is an iterator,
        /// so nothing runs until the result is enumerated.
        /// </returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
        {
            var pageWords = wordExtractor.GetWords(page.Letters);

            foreach (var textBlock in pageSegmenter.GetBlocks(pageWords))
            {
                var bounds = textBlock.BoundingBox;

                // Flatten block -> lines -> words -> letters.
                var blockLetters = textBlock.TextLines
                                            .SelectMany(line => line.Words)
                                            .SelectMany(word => word.Letters);

                // Only the paths and images lying inside the block's bounding box are considered.
                var blockPaths  = FeatureHelper.GetPathsInside(bounds, page.ExperimentalAccess.Paths);
                var blockImages = FeatureHelper.GetImagesInside(bounds, page.GetImages());

                var values = FeatureHelper.GetFeatures(page, bounds, blockLetters, blockPaths, blockImages);

                // The feature vector is positional: index i feeds the i-th field below.
                var input = new BlockFeatures
                {
                    CharsCount           = values[0],
                    PctNumericChars      = values[1],
                    PctAlphabeticalChars = values[2],
                    PctSymbolicChars     = values[3],
                    PctBulletChars       = values[4],
                    DeltaToHeight        = values[5],
                    PathsCount           = values[6],
                    PctBezierPaths       = values[7],
                    PctHorPaths          = values[8],
                    PctVertPaths         = values[9],
                    PctOblPaths          = values[10],
                    ImagesCount          = values[11],
                    ImageAvgProportion   = values[12]
                };

                var output   = predEngine.Predict(input);
                var category = FeatureHelper.Categories[(int)output.Prediction];
                var topScore = output.Score.Max();

                yield return (category, topScore, textBlock);
            }
        }
示例#2
0
        /// <summary>
        /// Loads the trained LightGBM block classifier, segments every page of "sample.pdf" into
        /// blocks, and prints each block's predicted category, score and normalized text to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            // 1. Convert pdf documents and their PAGE xml ground truth to csv files
            //DataGenerator.GetCsv(TEST_RAW_DATA_FILEPATH, 0, TEST_DATA_FILENAME);        // testing
            //DataGenerator.GetCsv(TRAIN_RAW_DATA_FILEPATH, 0, TRAIN_DATA_FILENAME);      // training

            // 2. Create the model
            //LightGbmModelBuilder.TrainModel(DataGenerator.GetDataPath(TRAIN_DATA_FILENAME), MODEL_NAME);

            // 3. Evaluate the model
            //LightGbmModelBuilder.Evaluate(MODEL_NAME, DataGenerator.GetDataPath(TEST_DATA_FILENAME));

            // 4. Load the trained classifier
            LightGbmBlockClassifier lightGbmBlockClassifier = new LightGbmBlockClassifier(LightGbmModelBuilder.GetModelPath(MODEL_NAME));

            NearestNeighbourWordExtractor nearestNeighbourWordExtractor = new NearestNeighbourWordExtractor();
            RecursiveXYCut recursiveXYCut = new RecursiveXYCut();

            using (var document = PdfDocument.Open("sample.pdf"))
            {
                // The boolean result is not needed: bookmarks is only read through '?.' below.
                // NOTE(review): presumably bookmarks is null when the call returns false - confirm.
                document.TryGetBookmarks(out Bookmarks bookmarks);

                // The type filter does not depend on the page, so it is evaluated once here
                // instead of once per page. Stays null when there are no bookmarks.
                List <DocumentBookmarkNode> documentBookmarkNodes = bookmarks?.GetNodes()
                                                                    .OfType <DocumentBookmarkNode>()
                                                                    .ToList();

                // Pages are requested with 1-based numbers.
                for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
                {
                    var page = document.GetPage(pageNumber);

                    // Average() throws InvalidOperationException on an empty sequence, so a page
                    // without letters is skipped rather than aborting the whole run.
                    if (!page.Letters.Any())
                    {
                        continue;
                    }

                    List <DocumentBookmarkNode> bookmarksNodes = documentBookmarkNodes?
                                                                 .Where(b => b.PageNumber == page.Number)
                                                                 .ToList();

                    var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

                    var words  = nearestNeighbourWordExtractor.GetWords(page.Letters);
                    var blocks = recursiveXYCut.GetBlocks(words, page.Width / 3.0);

                    foreach (var block in blocks)
                    {
                        var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                        var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                        var pred = lightGbmBlockClassifier.Classify(block, paths, images, avgPageFontHeight, bookmarksNodes);

                        Console.WriteLine();
                        Console.WriteLine(pred.Prediction + " [" + pred.Score.ToString("0.0%") + "]");
                        Console.WriteLine(block.Text.Normalize(normalizationForm: System.Text.NormalizationForm.FormKC)); // remove ligatures
                    }
                }
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }
示例#3
0
        /// <summary>
        /// Splits <paramref name="page"/> into text blocks and classifies each block with the prediction engine,
        /// optionally using the document bookmarks that point at this page as an extra feature input.
        /// </summary>
        /// <param name="page">Page whose letters, paths and images are read.</param>
        /// <param name="wordExtractor">Turns the page's letters into words.</param>
        /// <param name="pageSegmenter">Turns the extracted words into blocks.</param>
        /// <param name="bookmarks">
        /// Optional document bookmarks. When <c>null</c>, a <c>null</c> node list is passed to the feature extraction.
        /// </param>
        /// <returns>
        /// One tuple per block: the entry of <c>FeatureHelper.Categories</c> at the predicted index,
        /// the maximum of the prediction's scores, and the block itself. Empty when the page has no letters.
        /// The method is an iterator, so nothing runs until the result is enumerated.
        /// </returns>
        public IEnumerable <(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor,
                                                                                        IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
        {
            // Average() below throws InvalidOperationException on an empty sequence; a page
            // without letters has no text to classify, so the result is simply empty.
            if (!page.Letters.Any())
            {
                yield break;
            }

            // Keep only the document bookmark nodes that target this page (null when no bookmarks were given).
            List <DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
                                                         .OfType <DocumentBookmarkNode>()
                                                         .Where(b => b.PageNumber == page.Number)
                                                         .ToList();

            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

            var words  = wordExtractor.GetWords(page.Letters);
            var blocks = pageSegmenter.GetBlocks(words);

            foreach (var block in blocks)
            {
                var paths  = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(
                    block, paths,
                    images, avgPageFontHeight,
                    block.BoundingBox.Area,
                    bookmarksNodes);

                // The feature vector is positional: index i feeds the i-th field below.
                BlockFeatures blockFeatures = new BlockFeatures()
                {
                    BlockAspectRatio     = features[0],
                    CharsCount           = features[1],
                    WordsCount           = features[2],
                    LinesCount           = features[3],
                    PctNumericChars      = features[4],
                    PctAlphabeticalChars = features[5],
                    PctSymbolicChars     = features[6],
                    PctBulletChars       = features[7],
                    DeltaToHeight        = features[8],
                    PathsCount           = features[9],
                    PctBezierPaths       = features[10],
                    PctHorPaths          = features[11],
                    PctVertPaths         = features[12],
                    PctOblPaths          = features[13],
                    ImagesCount          = features[14],
                    ImageAvgProportion   = features[15],
                    BestNormEditDistance = features[16],
                };

                var result = predEngine.Predict(blockFeatures);

                yield return(FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
            }
        }