/// <summary>
/// Sample entry point: loads the trained LightGBM block classifier, opens "sample.pdf",
/// segments every page into text blocks and prints the predicted label, its score and the
/// normalised text of each block. Waits for a key press before returning.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // 1. Convert pdf documents and their PAGE xml ground truth to csv files
    //DataGenerator.GetCsv(TEST_RAW_DATA_FILEPATH, 0, TEST_DATA_FILENAME);    // testing
    //DataGenerator.GetCsv(TRAIN_RAW_DATA_FILEPATH, 0, TRAIN_DATA_FILENAME);  // training

    // 2. Create the model
    //LightGbmModelBuilder.TrainModel(DataGenerator.GetDataPath(TRAIN_DATA_FILENAME), MODEL_NAME);

    // 3. Evaluate the model
    //LightGbmModelBuilder.Evaluate(MODEL_NAME, DataGenerator.GetDataPath(TEST_DATA_FILENAME));

    // 4. Load the trained classifier
    LightGbmBlockClassifier lightGbmBlockClassifier = new LightGbmBlockClassifier(LightGbmModelBuilder.GetModelPath(MODEL_NAME));

    // The result is not used; the lookup is kept in case it is relied on as a sanity check
    // of the "label" column. TODO(review): confirm and remove if it is only a debugging leftover.
    _ = lightGbmBlockClassifier.OutputSchema["label"].HasSlotNames();

    NearestNeighbourWordExtractor nearestNeighbourWordExtractor = new NearestNeighbourWordExtractor();
    RecursiveXYCut recursiveXYCut = new RecursiveXYCut();

    using (var document = PdfDocument.Open("sample.pdf"))
    {
        // The return value is intentionally ignored: the null-conditional access below
        // covers the case where no bookmarks were produced.
        document.TryGetBookmarks(out Bookmarks bookmarks);

        // Enumerate the bookmark nodes once instead of once per page; stays null when
        // the document has no bookmarks.
        List<DocumentBookmarkNode> allBookmarkNodes = bookmarks?.GetNodes()
            .OfType<DocumentBookmarkNode>()
            .ToList();

        for (var i = 0; i < document.NumberOfPages; i++)
        {
            // Pages are requested with a 1-based number.
            var page = document.GetPage(i + 1);

            // Average() throws InvalidOperationException on an empty sequence, so a page
            // without any letters is skipped.
            if (!page.Letters.Any())
            {
                continue;
            }

            List<DocumentBookmarkNode> bookmarksNodes = allBookmarkNodes?
                .Where(b => b.PageNumber == page.Number)
                .ToList();

            var avgPageFontHeight = page.Letters.Select(l => l.GlyphRectangle.Height).Average();

            var words = nearestNeighbourWordExtractor.GetWords(page.Letters);
            var blocks = recursiveXYCut.GetBlocks(words, page.Width / 3.0);

            foreach (var block in blocks)
            {
                var paths = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var pred = lightGbmBlockClassifier.Classify(block, paths, images, avgPageFontHeight, bookmarksNodes);

                Console.WriteLine();
                Console.WriteLine(pred.Prediction + " [" + pred.Score.ToString("0.0%") + "]");
                Console.WriteLine(block.Text.Normalize(normalizationForm: System.Text.NormalizationForm.FormKC)); // remove ligatures
            }
        }
    }

    Console.ReadKey();
}
public void GetBlocks(string name, string[] expected)
{
    // Segments the first page of the named document with the recursive XY cut algorithm
    // and checks that the resulting blocks, ordered by the X and then descending Y of
    // their bottom-left corner, carry exactly the expected texts.
    using (var pdf = PdfDocument.Open(DlaHelper.GetDocumentPath(name)))
    {
        var firstPage = pdf.GetPage(1);
        var pageWords = NearestNeighbourWordExtractor.Instance.GetWords(firstPage.Letters);

        var segmenter = new RecursiveXYCut(new RecursiveXYCut.RecursiveXYCutOptions()
        {
            MinimumWidth = firstPage.Width / 3.0,
            LineSeparator = " "
        });
        var blocks = segmenter.GetBlocks(pageWords);

        // The block count must match before the texts are compared one by one.
        Assert.Equal(expected.Length, blocks.Count);

        var sortedBlocks = blocks
            .OrderBy(block => block.BoundingBox.BottomLeft.X)
            .ThenByDescending(block => block.BoundingBox.BottomLeft.Y)
            .ToList();

        int position = 0;
        foreach (var block in sortedBlocks)
        {
            Assert.Equal(expected[position], block.Text);
            position++;
        }
    }
}