Пример #1
0
        /// <summary>
        /// Get the PAGE-XML (XML) string of the page's layout.
        /// </summary>
        /// <param name="page">The page to export. Must not be null.</param>
        /// <param name="includePaths">Draw PdfPaths present in the page.</param>
        /// <returns>The serialized PAGE-XML document built for <paramref name="page"/>.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="page"/> is null.</exception>
        public string Get(Page page, bool includePaths)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // raised later from page.GetHashCode().
            if (page == null)
            {
                throw new ArgumentNullException(nameof(page));
            }

            // Reset the running counters and the ordered-region list before the page
            // is converted (presumably consumed by ToPageXmlPage — confirm there).
            lineCount       = 0;
            wordCount       = 0;
            glyphCount      = 0;
            regionCount     = 0;
            groupOrderCount = 0;
            orderedRegions  = new List <PageXmlDocument.PageXmlRegionRefIndexed>();

            // Read the clock once so Created and LastChange carry exactly the same
            // instant for a freshly generated document.
            DateTime now = DateTime.UtcNow;

            PageXmlDocument pageXmlDocument = new PageXmlDocument()
            {
                Metadata = new PageXmlDocument.PageXmlMetadata()
                {
                    Created    = now,
                    LastChange = now,
                    Creator    = "PdfPig",
                    // Records which segmenter and word extractor produced the layout.
                    Comments   = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name,
                },
                PcGtsId = "pc-" + page.GetHashCode()
            };

            pageXmlDocument.Page = ToPageXmlPage(page, includePaths);

            return Serialize(pageXmlDocument);
        }
Пример #2
0
        /// <summary>
        /// Builds the ALTO description: a single processing entry identifying PdfPig and
        /// the segmenter/extractor in use, plus the source image information for
        /// <paramref name="fileName"/>. Measurements are declared in pixels.
        /// </summary>
        /// <param name="fileName">File name stored in the source image information.</param>
        /// <returns>The populated <see cref="AltoDocument.AltoDescription"/>.</returns>
        private AltoDocument.AltoDescription GetAltoDescription(string fileName)
        {
            var processing = new AltoDocument.AltoDescriptionProcessing
            {
                ProcessingAgency   = null,
                ProcessingCategory = AltoDocument.AltoProcessingCategory.Other,
                // Round-trip ("o") format yields an ISO 8601 UTC timestamp such as
                // 2020-01-15T10:30:00.0000000Z, which is a valid xsd:dateTime. The invariant
                // general format (MM/dd/yyyy HH:mm:ss) used previously is not.
                ProcessingDateTime = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture),
                ProcessingSoftware = new AltoDocument.AltoProcessingSoftware
                {
                    SoftwareName           = "PdfPig",
                    SoftwareCreator        = @"https://github.com/UglyToad/PdfPig",
                    ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)",
                    SoftwareVersion        = "x.x.xx"
                },
                ProcessingStepDescription = null,
                // Records which segmenter and word extractor produced the layout.
                ProcessingStepSettings    = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name,
                Id = "P" + pageCount + "_D1"
            };

            // Identifiers are emitted as empty placeholders: no location or value is known here.
            var documentIdentifier = new AltoDocument.AltoDocumentIdentifier
            {
                DocumentIdentifierLocation = null,
                Value = null
            };

            var fileIdentifier = new AltoDocument.AltoFileIdentifier
            {
                FileIdentifierLocation = null,
                Value = null
            };

            return new AltoDocument.AltoDescription
            {
                MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel,
                Processings = new[] { processing },
                SourceImageInformation = new AltoDocument.AltoSourceImageInformation
                {
                    DocumentIdentifiers = new [] { documentIdentifier },
                    FileIdentifiers = new [] { fileIdentifier },
                    FileName = fileName
                }
            };
        }
Пример #3
0
 /// <summary>
 /// Builds the <c>head</c> element of the hOCR output: an empty title, the content-type
 /// meta tag, the ocr-system tag (segmenter and word extractor type names) and the
 /// ocr-capabilities tag. Lines are separated by "\n" and the result ends with "\n".
 /// </summary>
 /// <returns>The indented <c>head</c> markup.</returns>
 private string GetHead()
 {
     // Children of <head> sit one level deeper than the element itself.
     string childIndent = string.Concat(indentChar, indentChar);
     string ocrSystem   = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name;

     string[] headLines =
     {
         indentChar + "<head>",
         childIndent + "<title></title>",
         childIndent + "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />",
         childIndent + "<meta name='ocr-system' content='" + ocrSystem + "' />",
         childIndent + "<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocr_linedrawing' />",
         indentChar + "</head>"
     };

     return string.Join("\n", headLines) + "\n";
 }
Пример #4
0
        /// <summary>
        /// Get the hOCR (HTML) string for every page of the document, wrapped in a complete
        /// XHTML 1.0 Transitional document (XML declaration, doctype, head and body).
        /// </summary>
        /// <param name="document">The document whose pages are exported, in page order.</param>
        /// <returns>The full hOCR markup for <paramref name="document"/>.</returns>
        public string GetCode(PdfDocument document)
        {
            // Accumulate into a StringBuilder: appending each page with string '+=' re-copies
            // everything built so far and is quadratic in the size of the output.
            // Fully qualified so the method does not depend on a 'using System.Text;' directive.
            var hocr = new System.Text.StringBuilder();

            // XML declaration, doctype and opening <html>.
            hocr.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
            hocr.Append("\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
            hocr.Append("<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n");

            // <head>: the ocr-system tag records the segmenter and word extractor type names.
            hocr.Append(_indent).Append("<head>");
            hocr.Append("\n").Append(_indent).Append(_indent).Append("<title></title>");
            hocr.Append("\n").Append(_indent).Append(_indent).Append("<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />");
            hocr.Append("\n").Append(_indent).Append(_indent).Append("<meta name='ocr-system' content='")
                .Append(_pageSegmenter.GetType().Name).Append("|").Append(_wordExtractor.GetType().Name).Append("' />");
            hocr.Append("\n").Append(_indent).Append(_indent).Append("<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word' />");
            hocr.Append("\n").Append(_indent).Append("</head>\n");

            hocr.Append(_indent).Append("<body>\n");

            // Page numbers passed to GetPage are 1-based.
            for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
            {
                var page = document.GetPage(pageNumber);
                hocr.Append(GetCode(page, GetPageImagePath(_documentPath, pageNumber))).Append("\n");
            }

            // The hocrjs viewer script is loaded from the unpkg CDN at the end of the body.
            hocr.Append(_indent).Append("<script src='https://unpkg.com/hocrjs'></script>\n");
            hocr.Append(_indent).Append("</body>");
            hocr.Append("\n</html>");

            return hocr.ToString();
        }