/// <summary>
/// Builds a complete hOCR (XHTML) document covering every page of the given PDF document.
/// </summary>
/// <param name="document">The document whose pages are exported, from page 1 to <c>NumberOfPages</c>.</param>
/// <returns>
/// The XML declaration and XHTML DOCTYPE, followed by the <c>html</c> element containing the
/// head metadata, one hOCR block per page and a reference to the hocrjs viewer script.
/// </returns>
public string GetCode(PdfDocument document)
{
    // A StringBuilder is used instead of repeated string concatenation so that the cost of
    // appending a page does not grow with the size of everything appended before it.
    var hocr = new System.Text.StringBuilder();

    // XML prolog and opening html element.
    hocr.Append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
    hocr.Append("\n<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n\t\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n");
    hocr.Append("<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n");

    // The 'ocr-system' meta tag records the type names of the page segmenter and word extractor.
    string ocrSystem = _pageSegmenter.GetType().Name + "|" + _wordExtractor.GetType().Name;

    // Head section: one indent for <head>, two indents for its children.
    hocr.Append(_indent).Append("<head>").Append("\n");
    hocr.Append(_indent).Append(_indent).Append("<title></title>").Append("\n");
    hocr.Append(_indent).Append(_indent).Append("<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />").Append("\n");
    hocr.Append(_indent).Append(_indent).Append("<meta name='ocr-system' content='").Append(ocrSystem).Append("' />").Append("\n");
    hocr.Append(_indent).Append(_indent).Append("<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word' />").Append("\n");
    hocr.Append(_indent).Append("</head>\n");

    // Body: one block per page, each followed by a newline. Page numbers are 1-based.
    hocr.Append(_indent).Append("<body>\n");
    for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
    {
        var page = document.GetPage(pageNumber);
        hocr.Append(GetCode(page, GetPageImagePath(_documentPath, pageNumber))).Append("\n");
    }

    // The hocrjs script is referenced at the end of the body.
    hocr.Append(_indent).Append("<script src='https://unpkg.com/hocrjs'></script>\n");
    hocr.Append(_indent).Append("</body>");
    hocr.Append("\n</html>");

    return hocr.ToString();
}
/// <summary>
/// Exports the layout of a single page as a serialized PAGE-XML (XML) string.
/// </summary>
/// <param name="page">The page to export. Its hash code is used to build the document id.</param>
/// <param name="includePaths">Draw PdfPaths present in the page (forwarded to the page conversion).</param>
/// <returns>The serialized PAGE-XML document for <paramref name="page"/>.</returns>
public string Get(Page page, bool includePaths)
{
    // Every export starts from a clean state: counters back to zero, no ordered regions.
    lineCount = 0;
    wordCount = 0;
    glyphCount = 0;
    regionCount = 0;
    groupOrderCount = 0;
    orderedRegions = new List<PageXmlDocument.PageXmlRegionRefIndexed>();

    // Document metadata; the comments field records which segmenter and extractor were used.
    var metadata = new PageXmlDocument.PageXmlMetadata
    {
        Created = DateTime.UtcNow,
        LastChange = DateTime.UtcNow,
        Creator = "PdfPig",
        Comments = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name
    };

    // The page content is converted last, after the metadata and id have been set.
    var xml = new PageXmlDocument
    {
        Metadata = metadata,
        PcGtsId = "pc-" + page.GetHashCode(),
        Page = ToPageXmlPage(page, includePaths)
    };

    return Serialize(xml);
}
/// <summary>
/// Builds the ALTO description element: measurement unit, a single processing entry
/// describing this software, and the source image information.
/// </summary>
/// <param name="fileName">Value stored as the source image file name.</param>
/// <returns>
/// A description using pixel measurement units, with one processing step and one
/// (empty) document identifier and file identifier.
/// </returns>
private AltoDocument.AltoDescription GetAltoDescription(string fileName)
{
    var processing = new AltoDocument.AltoDescriptionProcessing
    {
        ProcessingAgency = null,
        ProcessingCategory = AltoDocument.AltoProcessingCategory.Other,
        // Round-trip ("o") format produces an ISO 8601 timestamp (e.g. 2024-01-31T12:00:00.0000000Z),
        // which is a valid xsd:dateTime. The default invariant format (MM/dd/yyyy HH:mm:ss) is not.
        ProcessingDateTime = DateTime.UtcNow.ToString("o", CultureInfo.InvariantCulture),
        ProcessingSoftware = new AltoDocument.AltoProcessingSoftware
        {
            SoftwareName = "PdfPig",
            SoftwareCreator = @"https://github.com/UglyToad/PdfPig",
            ApplicationDescription = "Read and extract text and other content from PDFs in C# (port of PdfBox)",
            // Literal placeholder; no real version number is looked up here.
            SoftwareVersion = "x.x.xx"
        },
        ProcessingStepDescription = null,
        // Records the type names of the page segmenter and word extractor in use.
        ProcessingStepSettings = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name,
        Id = "P" + pageCount + "_D1"
    };

    // Identifiers are emitted with no location and no value.
    var documentIdentifier = new AltoDocument.AltoDocumentIdentifier
    {
        DocumentIdentifierLocation = null,
        Value = null
    };

    var fileIdentifier = new AltoDocument.AltoFileIdentifier
    {
        FileIdentifierLocation = null,
        Value = null
    };

    return new AltoDocument.AltoDescription
    {
        MeasurementUnit = AltoDocument.AltoMeasurementUnit.Pixel,
        Processings = new[] { processing },
        SourceImageInformation = new AltoDocument.AltoSourceImageInformation
        {
            DocumentIdentifiers = new[] { documentIdentifier },
            FileIdentifiers = new[] { fileIdentifier },
            FileName = fileName
        }
    };
}
/// <summary>
/// Builds the hOCR <c>head</c> element, including the content-type, ocr-system and
/// ocr-capabilities meta tags.
/// </summary>
/// <returns>
/// The head element as text: <c>head</c> tags indented once, child elements indented twice,
/// lines separated by '\n' and terminated by a trailing '\n'.
/// </returns>
private string GetHead()
{
    // The 'ocr-system' value is "<segmenter type name>|<word extractor type name>".
    string ocrSystem = pageSegmenter.GetType().Name + "|" + wordExtractor.GetType().Name;

    // Assemble line by line; each continuation starts with the newline that ends the previous line.
    string head = indentChar + "<head>";
    head += "\n" + indentChar + indentChar + "<title></title>";
    head += "\n" + indentChar + indentChar + "<meta http-equiv='Content-Type' content='text/html;charset=utf-8' />";
    head += "\n" + indentChar + indentChar + "<meta name='ocr-system' content='" + ocrSystem + "' />";
    head += "\n" + indentChar + indentChar + "<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocr_linedrawing' />";
    head += "\n" + indentChar + "</head>\n";

    return head;
}