/// <summary>
/// Classifies every text block of a pdf document with a previously trained model and prints the result to the console.
/// </summary>
/// <remarks>
/// The model is loaded from the GZip-compressed file "model.gz" found in <paramref name="trainingFolder"/>.
/// For each page that contains at least one word, the predicted category name and the text of every block
/// are printed, followed by an empty line. The method then waits for a key press before continuing with the
/// next page. Pages without any word are skipped without waiting for a key press.
/// </remarks>
/// <param name="trainingFolder">The folder that contains the serialized model file "model.gz".</param>
/// <param name="pdfPath">The path to the pdf document to classify.</param>
public static void TestClassification(string trainingFolder, string pdfPath)
{
    var svm = Serializer.Load<MulticlassSupportVectorMachine<Gaussian>>(
        Path.Combine(trainingFolder, "model.gz"),
        SerializerCompression.GZip);

    using (var document = PdfDocument.Open(pdfPath))
    {
        // GetPage is called with page numbers 1..NumberOfPages.
        for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
        {
            var page = document.GetPage(pageNumber);

            // Materialise the words once: they are needed both for the emptiness check
            // and for the segmentation, and the sequence should not be evaluated twice.
            var words = page.GetWords().ToList();
            if (words.Count == 0)
            {
                continue;
            }

            var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);
            foreach (var block in blocks)
            {
                // Lists (rather than lazy queries) are handed to the feature extraction,
                // the same way GenerateCsv prepares its inputs.
                var letters = block.TextLines
                    .SelectMany(line => line.Words)
                    .SelectMany(word => word.Letters)
                    .ToList();
                var paths = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths).ToList();
                var images = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());

                var features = FeatureHelper.GetFeatures(page, block.BoundingBox, letters, paths, images);
                var category = svm.Decide(features);

                Console.WriteLine(FeatureHelper.Categories[category]);
                Console.WriteLine(block.Text);
                Console.WriteLine();
            }

            // Pause after each page so the output can be inspected.
            Console.ReadKey();
        }
    }
}
/// <summary>
/// Generate a csv file of features. You will need the pdf documents and the ground truths in PAGE xml format.
/// </summary>
/// <remarks>
/// <para>
/// A random selection of pdf files from <paramref name="trainingFolder"/> is processed. For each selected
/// document, the ground truth files matching "&lt;pdf name without extension&gt;_*.xml" are read, and one row of
/// features is produced per region found in them. The resulting rows are written to "features.csv" in
/// <paramref name="trainingFolder"/>, the last column being the category number:
/// 0 = heading, 1 = paragraph, 2 = list label, 3 = table, 4 = image.
/// </para>
/// <para>
/// Numbers are always written using the invariant culture, so that the decimal separator never collides
/// with the ',' column separator.
/// </para>
/// <para>
/// Documents without any ground truth file are reported on the console and skipped. Any exception raised
/// while processing a document (including an unknown region or text type) is reported on the console and
/// the remaining regions of that document are abandoned; rows already collected are kept.
/// </para>
/// </remarks>
/// <param name="trainingFolder">The path to the training folder. Should contain both the pdf files and their corresponding ground truth xml files.</param>
/// <param name="numberOfPdfDocs">Number of documents to consider.</param>
/// <exception cref="ArgumentException">Thrown if the number of feature rows and of categories differ.</exception>
public static void GenerateCsv(string trainingFolder, int numberOfPdfDocs)
{
    var features = new List<double[]>();
    var categories = new List<int>();
    int done = 0;

    var directory = new DirectoryInfo(trainingFolder);
    var pdfFileLinks = directory.GetFiles("*.pdf");
    var indexesSelected = GenerateRandom(numberOfPdfDocs, 0, pdfFileLinks.Length);

    foreach (int index in indexesSelected)
    {
        var pdfFile = pdfFileLinks[index];
        string fileName = pdfFile.Name;

        // Build the search pattern from the name without its extension. A textual replace of ".pdf"
        // is case-sensitive and would also alter a ".pdf" occurring in the middle of the name.
        string xmlFileNameTemplate = Path.GetFileNameWithoutExtension(fileName) + "_*.xml";
        var pageXmlLinks = directory.GetFiles(xmlFileNameTemplate);

        if (pageXmlLinks.Length == 0)
        {
            Console.BackgroundColor = ConsoleColor.Red;
            Console.WriteLine($"No PageXml file found for document '{fileName}'");
            Console.ResetColor();
            continue;
        }

        try
        {
            using (var doc = PdfDocument.Open(pdfFile.FullName))
            {
                foreach (var pageXmlLink in pageXmlLinks)
                {
                    var pageXml = Deserialize(pageXmlLink.FullName);

                    // NOTE(review): the parsed value is incremented before calling GetPage, so
                    // ParseXmlFileName presumably returns a 0-based page index - confirm.
                    int pageNo = ParseXmlFileName(pageXmlLink.Name);
                    var page = doc.GetPage(pageNo + 1);
                    double pageHeight = (double)page.Height;

                    foreach (var block in pageXml.Page.Items)
                    {
                        // Both variables are assigned in every branch below that does not throw,
                        // which the compiler verifies through definite assignment.
                        int category;
                        PdfRectangle bbox;

                        if (block is PageXmlTextRegion textBlock)
                        {
                            bbox = ParsePageXmlCoord(textBlock.Coords.Points, pageHeight);
                            switch (textBlock.Type)
                            {
                                case PageXmlTextSimpleType.Heading:
                                    category = 0;
                                    break;

                                case PageXmlTextSimpleType.Paragraph:
                                    category = 1;
                                    break;

                                case PageXmlTextSimpleType.LisLabel:
                                    category = 2;
                                    break;

                                default:
                                    throw new ArgumentException("Unknown category");
                            }
                        }
                        else if (block is PageXmlTableRegion tableBlock)
                        {
                            bbox = ParsePageXmlCoord(tableBlock.Coords.Points, pageHeight);
                            category = 3;
                        }
                        else if (block is PageXmlImageRegion imageBlock)
                        {
                            bbox = ParsePageXmlCoord(imageBlock.Coords.Points, pageHeight);
                            category = 4;
                        }
                        else
                        {
                            throw new ArgumentException("Unknown region type");
                        }

                        var letters = FeatureHelper.GetLettersInside(bbox, page.Letters).ToList();
                        var paths = FeatureHelper.GetPathsInside(bbox, page.ExperimentalAccess.Paths).ToList();
                        var images = FeatureHelper.GetImagesInside(bbox, page.GetImages());
                        var featureVector = FeatureHelper.GetFeatures(page, bbox, letters, paths, images);

                        // Regions for which no features could be computed are left out of the csv.
                        if (featureVector != null)
                        {
                            features.Add(featureVector);
                            categories.Add(category);
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best effort: report the failing document and carry on with the next one.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"Error for document '{fileName}': {ex.Message}");
            Console.ResetColor();
        }

        Console.WriteLine(done++);
    }

    if (features.Count != categories.Count)
    {
        throw new ArgumentException("features and categories don't have the same size");
    }

    // Format with the invariant culture: with a culture using ',' as decimal separator,
    // the default formatting would produce values that cannot be told apart from columns.
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    string[] csv = features
        .Zip(categories, (row, label) =>
            string.Join(',', row.Select(value => value.ToString(invariant))) + "," + label.ToString(invariant))
        .ToArray();

    File.WriteAllLines(Path.Combine(trainingFolder, "features.csv"), csv);
}