/// <summary>
/// Splits <paramref name="page"/> into text blocks and predicts a category for each block.
/// </summary>
/// <param name="page">Page whose letters, paths and images are read.</param>
/// <param name="wordExtractor">Turns the page letters into words.</param>
/// <param name="pageSegmenter">Groups the extracted words into text blocks.</param>
/// <returns>
/// One tuple per block: the <c>FeatureHelper.Categories</c> entry selected by the prediction,
/// the largest value of the prediction's score array, and the block itself.
/// </returns>
/// <remarks>
/// This is an iterator method: extraction, segmentation and prediction only run while the
/// returned sequence is being enumerated.
/// </remarks>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter)
{
    foreach (var textBlock in pageSegmenter.GetBlocks(wordExtractor.GetWords(page.Letters)))
    {
        // Every letter of the block, walking lines -> words -> letters.
        var blockLetters = textBlock.TextLines.SelectMany(line => line.Words.SelectMany(word => word.Letters));

        // Page paths and images restricted to the block's bounding box.
        var pathsInBlock = FeatureHelper.GetPathsInside(textBlock.BoundingBox, page.ExperimentalAccess.Paths);
        var imagesInBlock = FeatureHelper.GetImagesInside(textBlock.BoundingBox, page.GetImages());

        var f = FeatureHelper.GetFeatures(page, textBlock.BoundingBox, blockLetters, pathsInBlock, imagesInBlock);

        // The feature vector is positional; each index maps to one model input column.
        var input = new BlockFeatures
        {
            CharsCount = f[0],
            PctNumericChars = f[1],
            PctAlphabeticalChars = f[2],
            PctSymbolicChars = f[3],
            PctBulletChars = f[4],
            DeltaToHeight = f[5],
            PathsCount = f[6],
            PctBezierPaths = f[7],
            PctHorPaths = f[8],
            PctVertPaths = f[9],
            PctOblPaths = f[10],
            ImagesCount = f[11],
            ImageAvgProportion = f[12]
        };

        var output = predEngine.Predict(input);
        var category = FeatureHelper.Categories[(int)output.Prediction];
        var bestScore = output.Score.Max();

        yield return (category, bestScore, textBlock);
    }
}
/// <summary>
/// Demo entry point: loads the trained block classifier, segments every page of
/// "sample.pdf" into text blocks and prints each block's predicted category, score and text.
/// Waits for a key press before exiting.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // 1. Convert pdf documents and their PAGE xml ground truth to csv files
    //DataGenerator.GetCsv(TEST_RAW_DATA_FILEPATH, 0, TEST_DATA_FILENAME);   // testing
    //DataGenerator.GetCsv(TRAIN_RAW_DATA_FILEPATH, 0, TRAIN_DATA_FILENAME); // training

    // 2. Create the model
    //LightGbmModelBuilder.TrainModel(DataGenerator.GetDataPath(TRAIN_DATA_FILENAME), MODEL_NAME);

    // 3. Evaluate the model
    //LightGbmModelBuilder.Evaluate(MODEL_NAME, DataGenerator.GetDataPath(TEST_DATA_FILENAME));

    // 4. Load the trained classifier
    LightGbmBlockClassifier lightGbmBlockClassifier = new LightGbmBlockClassifier(LightGbmModelBuilder.GetModelPath(MODEL_NAME));

    NearestNeighbourWordExtractor nearestNeighbourWordExtractor = new NearestNeighbourWordExtractor();
    RecursiveXYCut recursiveXYCut = new RecursiveXYCut();

    using (var document = PdfDocument.Open("sample.pdf"))
    {
        // Flatten the bookmark tree once per document rather than once per page.
        // 'documentBookmarks' stays null when the document exposes no bookmarks.
        document.TryGetBookmarks(out Bookmarks bookmarks);
        List<DocumentBookmarkNode> documentBookmarks = bookmarks?.GetNodes()
            .OfType<DocumentBookmarkNode>()
            .ToList();

        // Page numbers passed to GetPage are 1-based.
        for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            var page = document.GetPage(pageNumber);

            // Average() throws InvalidOperationException on an empty sequence, so a page
            // without any letter (blank or image-only) is skipped: it has no text to classify.
            if (!page.Letters.Any())
            {
                continue;
            }

            // Bookmarks pointing at the current page (null when there are no bookmarks).
            List<DocumentBookmarkNode> bookmarksNodes = documentBookmarks
                ?.Where(b => b.PageNumber == page.Number)
                .ToList();

            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

            var words = nearestNeighbourWordExtractor.GetWords(page.Letters);
            var blocks = recursiveXYCut.GetBlocks(words, page.Width / 3.0);

            // Page-level lookups do not depend on the block, so do them once per page.
            var pagePaths = page.ExperimentalAccess.Paths;
            var pageImages = page.GetImages();

            foreach (var block in blocks)
            {
                var paths = FeatureHelper.GetPathsInside(block.BoundingBox, pagePaths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, pageImages);

                var pred = lightGbmBlockClassifier.Classify(block, paths, images, avgPageFontHeight, bookmarksNodes);

                Console.WriteLine();
                Console.WriteLine(pred.Prediction + " [" + pred.Score.ToString("0.0%") + "]");
                // Compatibility normalization (FormKC) to remove ligatures from the printed text.
                Console.WriteLine(block.Text.Normalize(normalizationForm: System.Text.NormalizationForm.FormKC));
            }
        }
    }

    Console.ReadKey();
}
/// <summary>
/// Splits <paramref name="page"/> into text blocks and predicts a category for each block,
/// optionally using the document bookmarks that point at this page as an extra feature source.
/// </summary>
/// <param name="page">Page whose letters, paths and images are read.</param>
/// <param name="wordExtractor">Turns the page letters into words.</param>
/// <param name="pageSegmenter">Groups the extracted words into text blocks.</param>
/// <param name="bookmarks">
/// Document bookmarks, or <c>null</c>. When <c>null</c>, a <c>null</c> bookmark list is handed
/// to <c>FeatureHelper.GetFeatures</c>.
/// </param>
/// <returns>
/// One tuple per block: the <c>FeatureHelper.Categories</c> entry selected by the prediction,
/// the largest value of the prediction's score array, and the block itself. Empty when the
/// page contains no letters.
/// </returns>
/// <remarks>
/// This is an iterator method: all work (including the empty-page check) only runs while the
/// returned sequence is being enumerated.
/// </remarks>
public IEnumerable<(string Prediction, float Score, TextBlock Block)> Classify(Page page, IWordExtractor wordExtractor, IPageSegmenter pageSegmenter, Bookmarks bookmarks = null)
{
    // Average() below throws InvalidOperationException on an empty sequence. A page without
    // letters has no text to classify, so yield nothing instead of crashing.
    if (!page.Letters.Any())
    {
        yield break;
    }

    // Keep only document-level bookmark nodes that target this page (null when no bookmarks).
    List<DocumentBookmarkNode> bookmarksNodes = bookmarks?.GetNodes()
        .OfType<DocumentBookmarkNode>()
        .Where(b => b.PageNumber == page.Number)
        .ToList();

    var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

    var words = wordExtractor.GetWords(page.Letters);
    var blocks = pageSegmenter.GetBlocks(words);

    // Page-level lookups do not depend on the block, so do them once per page.
    var pagePaths = page.ExperimentalAccess.Paths;
    var pageImages = page.GetImages();

    foreach (var block in blocks)
    {
        var paths = FeatureHelper.GetPathsInside(block.BoundingBox, pagePaths);
        var images = FeatureHelper.GetImagesInside(block.BoundingBox, pageImages);

        var features = FeatureHelper.GetFeatures(
            block, paths, images, avgPageFontHeight, block.BoundingBox.Area, bookmarksNodes);

        // The feature vector is positional; each index maps to one model input column.
        BlockFeatures blockFeatures = new BlockFeatures
        {
            BlockAspectRatio = features[0],
            CharsCount = features[1],
            WordsCount = features[2],
            LinesCount = features[3],
            PctNumericChars = features[4],
            PctAlphabeticalChars = features[5],
            PctSymbolicChars = features[6],
            PctBulletChars = features[7],
            DeltaToHeight = features[8],
            PathsCount = features[9],
            PctBezierPaths = features[10],
            PctHorPaths = features[11],
            PctVertPaths = features[12],
            PctOblPaths = features[13],
            ImagesCount = features[14],
            ImageAvgProportion = features[15],
            BestNormEditDistance = features[16],
        };

        var result = predEngine.Predict(blockFeatures);

        yield return (FeatureHelper.Categories[(int)result.Prediction], result.Score.Max(), block);
    }
}