/// <summary>
        /// Loads the serialized SVM model stored as "model.gz" in <paramref name="trainingFolder"/> and
        /// prints, for every text block found on every page of the given pdf document, the predicted
        /// category followed by the text of the block.
        /// Waits for a key press after each page that contains words.
        /// </summary>
        /// <param name="trainingFolder">The folder containing the GZip-compressed model file "model.gz".</param>
        /// <param name="pdfPath">The path to the pdf document to classify.</param>
        public static void TestClassification(string trainingFolder, string pdfPath)
        {
            var modelPath = Path.Combine(trainingFolder, "model.gz");
            var svm       = Serializer.Load<MulticlassSupportVectorMachine<Gaussian>>(modelPath, SerializerCompression.GZip);

            using (var document = PdfDocument.Open(pdfPath))
            {
                // GetPage is called with 1-based page numbers, so iterate over them directly.
                for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
                {
                    var page = document.GetPage(pageNumber);

                    // Materialise the words once: the sequence is consumed both by the emptiness
                    // check and by the page segmenter, and re-enumerating it would redo the extraction.
                    var words = page.GetWords().ToList();
                    if (words.Count == 0)
                    {
                        // Nothing to segment on this page (no key press is requested either).
                        continue;
                    }

                    var blocks = DocstrumBoundingBoxes.Instance.GetBlocks(words);

                    foreach (var block in blocks)
                    {
                        var letters  = block.TextLines.SelectMany(li => li.Words).SelectMany(w => w.Letters);
                        var paths    = FeatureHelper.GetPathsInside(block.BoundingBox, page.ExperimentalAccess.Paths);
                        var images   = FeatureHelper.GetImagesInside(block.BoundingBox, page.GetImages());
                        var features = FeatureHelper.GetFeatures(page, block.BoundingBox, letters, paths, images);

                        // The decision is used as an index into FeatureHelper.Categories.
                        var category = svm.Decide(features);

                        Console.WriteLine(FeatureHelper.Categories[category]);
                        Console.WriteLine(block.Text);
                        Console.WriteLine();
                    }

                    // Pause so the output of the page can be inspected before moving on.
                    Console.ReadKey();
                }
            }
        }
        // Example #2
        /// <summary>
        /// Generate a csv file of features. You will need the pdf documents and the ground truths in PAGE xml format.
        /// <para>
        /// The result is written to "features.csv" in <paramref name="trainingFolder"/>: one line per ground truth
        /// region, made of the comma-separated feature values followed by the category number
        /// (0 = heading, 1 = paragraph, 2 = list label, 3 = table, 4 = image).
        /// Numbers are written using the invariant culture so that the decimal separator never clashes with the
        /// column separator.
        /// </para>
        /// <para>
        /// Documents without any PAGE xml file are skipped, and any error raised while processing a document is
        /// reported on the console before moving on to the next document.
        /// </para>
        /// </summary>
        /// <param name="trainingFolder">The path to the training folder. Should contain both the pdf files and their corresponding ground truth xml files.</param>
        /// <param name="numberOfPdfDocs">Number of documents to consider.</param>
        public static void GenerateCsv(string trainingFolder, int numberOfPdfDocs)
        {
            var features   = new List<double[]>();
            var categories = new List<int>();

            int done = 0;

            var directory    = new DirectoryInfo(trainingFolder);
            var pdfFileLinks = directory.GetFiles("*.pdf");

            // NOTE(review): GenerateRandom is defined elsewhere; presumably it returns numberOfPdfDocs
            // indexes into pdfFileLinks - confirm its behaviour when fewer pdf files are available.
            var indexesSelected = GenerateRandom(numberOfPdfDocs, 0, pdfFileLinks.Length);

            foreach (int index in indexesSelected)
            {
                var    pdfFile  = pdfFileLinks[index];
                string fileName = pdfFile.Name;

                // Build the search pattern from the name without its extension: a plain Replace(".pdf", ...)
                // is case-sensitive (so "FOO.PDF" would end up matching the pdf itself) and would also
                // rewrite a ".pdf" appearing in the middle of the name.
                string xmlFileNameTemplate = Path.GetFileNameWithoutExtension(fileName) + "_*.xml";
                var    pageXmlLinks        = directory.GetFiles(xmlFileNameTemplate);

                if (pageXmlLinks.Length == 0)
                {
                    Console.BackgroundColor = ConsoleColor.Red;
                    Console.WriteLine("No PageXml file found for document '" + fileName + "'");
                    Console.ResetColor();
                    continue;
                }

                try
                {
                    using (var doc = PdfDocument.Open(pdfFile.FullName))
                    {
                        foreach (var pageXmlLink in pageXmlLinks)
                        {
                            var pageXml = Deserialize(pageXmlLink.FullName);
                            int pageNo  = ParseXmlFileName(pageXmlLink.Name);

                            // The parsed number is shifted by one before being handed to GetPage.
                            var    page       = doc.GetPage(pageNo + 1);
                            double pageHeight = (double)page.Height;

                            foreach (var block in pageXml.Page.Items)
                            {
                                // Both variables are assigned on every path that does not throw.
                                int          category;
                                PdfRectangle bbox;

                                if (block is PageXmlTextRegion textBlock)
                                {
                                    bbox = ParsePageXmlCoord(textBlock.Coords.Points, pageHeight);
                                    switch (textBlock.Type)
                                    {
                                    case PageXmlTextSimpleType.Heading:
                                        category = 0;
                                        break;

                                    case PageXmlTextSimpleType.Paragraph:
                                        category = 1;
                                        break;

                                    case PageXmlTextSimpleType.LisLabel:
                                        category = 2;
                                        break;

                                    default:
                                        throw new ArgumentException("Unknown category");
                                    }
                                }
                                else if (block is PageXmlTableRegion tableBlock)
                                {
                                    bbox     = ParsePageXmlCoord(tableBlock.Coords.Points, pageHeight);
                                    category = 3;
                                }
                                else if (block is PageXmlImageRegion imageBlock)
                                {
                                    bbox     = ParsePageXmlCoord(imageBlock.Coords.Points, pageHeight);
                                    category = 4;
                                }
                                else
                                {
                                    throw new ArgumentException("Unknown region type");
                                }

                                var letters = FeatureHelper.GetLettersInside(bbox, page.Letters).ToList();
                                var paths   = FeatureHelper.GetPathsInside(bbox, page.ExperimentalAccess.Paths).ToList();
                                var images  = FeatureHelper.GetImagesInside(bbox, page.GetImages());
                                var f       = FeatureHelper.GetFeatures(page, bbox, letters, paths, images);

                                // Regions for which no feature vector is returned are left out of the csv.
                                if (f != null)
                                {
                                    features.Add(f);
                                    categories.Add(category);
                                }
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: report the failure and carry on with the next document.
                    // Regions already collected for this document before the error are kept.
                    Console.ForegroundColor = ConsoleColor.Red;
                    Console.WriteLine("Error for document '" + fileName + "': " + ex.Message);
                    Console.ResetColor();
                }

                // Progress indicator: prints the zero-based count of documents handled so far.
                Console.WriteLine(done++);
            }

            if (features.Count != categories.Count)
            {
                throw new ArgumentException("features and categories don't have the same size");
            }

            // Format with the invariant culture: with a culture using ',' as decimal separator the
            // feature values would otherwise be indistinguishable from the column separator.
            var invariant = System.Globalization.CultureInfo.InvariantCulture;
            string[] csv = features
                .Zip(categories, (vector, label) =>
                    string.Join(",", vector.Select(v => v.ToString(invariant))) + "," + label.ToString(invariant))
                .ToArray();

            File.WriteAllLines(Path.Combine(trainingFolder, "features.csv"), csv);
        }