/// <summary>
/// Runs a <c>PDFHocr</c> text extraction over <paramref name="document"/> and hands the
/// collected pages to <c>GetHocrFromPageList</c> together with <paramref name="outputfile"/>.
/// </summary>
/// <param name="document">PDF document to extract from.</param>
/// <param name="outputfile">Output target forwarded unchanged to <c>GetHocrFromPageList</c>.</param>
/// <param name="useWords">Value assigned to the extractor's <c>getHOCRByWords</c> member.</param>
/// <returns>
/// <c>true</c> when every step completes without throwing; <c>false</c> when any exception
/// is raised (the exception itself is discarded).
/// </returns>
internal static bool CreateHocrFileFromPDF(PDDocument document, string outputfile, bool useWords)
{
    try
    {
        PDFHocr extractor = new PDFHocr() { getHOCRByWords = useWords };
        extractor.setSortByPosition(true);
        extractor.setStartPage(0);
        extractor.setEndPage(document.getNumberOfPages());

        PDFHelper.DisplayTrialPopupIfNecessary();
        if (PDFHelper.AddStamp)
        {
            // NOTE(review): AddStamp presumably signals trial mode, capping extraction at page 3 - confirm.
            extractor.setEndPage(3);
        }

        // The written text goes to an in-memory stream that is never read back; the call is
        // made for its effect on the extractor (presumably filling lineList/pageList - confirm).
        extractor.writeText(document, new OutputStreamWriter(new ByteArrayOutputStream()));

        // Lines still pending after extraction are wrapped into one more page.
        if (extractor.lineList != null && extractor.lineList.Count > 0)
        {
            HocrPageModel trailingPage = new HocrPageModel();
            trailingPage.Lines.AddRange(extractor.SortLineList(extractor.lineList));
            extractor.pageList.Add(trailingPage);
            extractor.lineList.Clear();
        }

        extractor.GetHocrFromPageList(extractor.pageList, outputfile);
        return true;
    }
    catch (Exception)
    {
        // Best-effort contract: every failure is reported to the caller as false.
        return false;
    }
}
/// <summary>
/// Runs a <c>PDFHocr</c> text extraction over <paramref name="document"/> and returns the
/// extractor's <c>pageList</c>.
/// </summary>
/// <param name="document">PDF document to extract from.</param>
/// <returns>
/// The extractor's page list when extraction completes without throwing; <c>null</c> when
/// any exception is raised (the exception itself is discarded).
/// </returns>
internal static List<HocrPageModel> GetPageWordDetails(PDDocument document)
{
    try
    {
        PDFHocr extractor = new PDFHocr();
        extractor.setSortByPosition(true);
        extractor.setStartPage(0);
        extractor.setEndPage(document.getNumberOfPages());

        // In-memory sink: the written text is never read back from it.
        Writer discardedOutput = new OutputStreamWriter(new ByteArrayOutputStream());

        PDFHelper.DisplayTrialPopupIfNecessary();
        if (PDFHelper.AddStamp)
        {
            // NOTE(review): AddStamp presumably signals trial mode, capping extraction at page 3 - confirm.
            extractor.setEndPage(3);
        }

        extractor.writeText(document, discardedOutput);

        // Lines still pending after extraction are wrapped into one more page.
        if (extractor.lineList != null && extractor.lineList.Count > 0)
        {
            HocrPageModel trailingPage = new HocrPageModel();
            trailingPage.Lines.AddRange(extractor.SortLineList(extractor.lineList));
            extractor.pageList.Add(trailingPage);
            extractor.lineList.Clear();
        }

        return extractor.pageList;
    }
    catch (Exception)
    {
        // Best-effort contract: callers receive null on any failure.
        return null;
    }
}
/// <summary>
/// Writes one hOCR (XHTML) page for <paramref name="page"/> to <paramref name="fileName"/>.
/// </summary>
/// <remarks>
/// The <c>ocr_page</c> bounding box is <c>0 0 maxX maxY</c>, where maxX / maxY are the largest
/// <c>XCord1</c> / <c>YCord1</c> over all words of all lines on the page.
/// All page-derived content (line markup and the page header) is built before the output
/// file is opened, so a failure while reading the page data no longer leaves a truncated
/// file behind.
/// </remarks>
/// <param name="page">Page whose lines are rendered.</param>
/// <param name="fileName">Path of the file to create or overwrite.</param>
/// <param name="pageNumber">Value emitted as <c>ppageno</c> in the page header.</param>
/// <returns>
/// <c>true</c> when the file was written; <c>false</c> when any exception was raised
/// (the exception itself is discarded).
/// </returns>
private bool CreateHocrPage(HocrPageModel page, string fileName, int pageNumber)
{
    try
    {
        List<string> lineMarkup = new List<string>();
        foreach (HocrLineModel line in page.Lines)
        {
            lineMarkup.Add(this.GetLineWithWords(line));
        }

        // Computed up front on purpose: Enumerable.Max throws on an empty sequence of
        // non-nullable values (a page without lines, or a line without words). Doing this
        // before the StreamWriter exists keeps that failure from truncating the target file.
        string pageHeader = string.Format(
            "\t\t <div class=\"ocr_page\" title=\"bbox 0 0 {0} {1}; ppageno {2}\">",
            page.Lines.Max<HocrLineModel>((HocrLineModel l) => l.Words.Max<WordData>((WordData w) => w.XCord1)),
            page.Lines.Max<HocrLineModel>((HocrLineModel l) => l.Words.Max<WordData>((WordData w) => w.YCord1)),
            pageNumber);

        using (StreamWriter writer = new StreamWriter(fileName))
        {
            writer.WriteLine("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\" > ");
            writer.WriteLine("<html xmlns=\"http://www.w3.org/1999/xhtml\">");
            writer.WriteLine("\t<head>");
            writer.WriteLine("\t\t <title>OCR Output</title>");
            writer.WriteLine("\t</head>");
            writer.WriteLine("\t<body>");
            writer.WriteLine(pageHeader);
            foreach (string markup in lineMarkup)
            {
                writer.WriteLine(markup);
            }
            writer.WriteLine("\t\t</div>");
            writer.WriteLine("\t</body>");
            writer.WriteLine("</html>");
        }

        return true;
    }
    catch (Exception)
    {
        // Best-effort contract: every failure is reported to the caller as false.
        return false;
    }
}
/// <summary>
/// Moves the currently collected lines into a new page: when <c>lineList</c> holds at least
/// one line, the lines (ordered by <c>SortLineList</c>) are added to a fresh
/// <c>HocrPageModel</c>, that page is appended to <c>pageList</c>, and <c>lineList</c> is cleared.
/// Does nothing when <c>lineList</c> is null or empty.
/// </summary>
private void AddToPageList()
{
    if (this.lineList == null || this.lineList.Count <= 0)
    {
        return;
    }

    HocrPageModel completedPage = new HocrPageModel();
    completedPage.Lines.AddRange(this.SortLineList(this.lineList));
    this.pageList.Add(completedPage);

    // Reset the buffer so the next page starts from an empty line list.
    this.lineList.Clear();
}