/// <summary>
/// Returns the text of the document, preferring the result of
/// <c>GetTextFromSimplePdf</c> and falling back to running Tesseract over a
/// TIFF rendering when that result is null, empty or whitespace only.
/// </summary>
/// <param name="tessdataLocation">
/// Passed through unchanged to <c>Tesseract.GetTextFromTiff</c>; may be null.
/// </param>
/// <returns>The extracted text.</returns>
public async Task<string> GetText(string tessdataLocation = null)
{
    // Make sure the document has been loaded before touching it.
    if (!_loaded)
    {
        await Load();
    }

    // Cheap path first: use the directly extracted text if it has any content.
    var extracted = GetTextFromSimplePdf();
    if (!string.IsNullOrWhiteSpace(extracted))
    {
        return extracted;
    }

    // Otherwise render the document to a TIFF and hand it to Tesseract.
    var tiffBytes = await GetImage(ImageFormat.Tiff);
    try
    {
        return await Tesseract.GetTextFromTiff(tiffBytes, tessdataLocation);
    }
    catch (IOException)
    {
        // Re-save the image as TIFF and retry once; a second failure propagates.
        tiffBytes = ImageManipulation.ChangeFormat(tiffBytes, ImageFormat.Tiff);
        return await Tesseract.GetTextFromTiff(tiffBytes, tessdataLocation);
    }
}
/// <summary>
/// Renders the underlying stream to image bytes in the requested format,
/// reading it with Magick.NET at a density of 300.
/// </summary>
/// <param name="format">
/// Target image format. Its file extension (from <c>GetFilenameExtension</c>)
/// determines the format Magick.NET writes.
/// </param>
/// <param name="readAsOneImage">
/// When true, all frames are appended vertically and written as one image.
/// When false, each frame is written separately and the results are passed to
/// <c>ImageManipulation.CombineImages</c>.
/// </param>
/// <returns>The encoded image bytes.</returns>
private async Task<byte[]> GetImage(ImageFormat format, bool readAsOneImage = true)
{
    if (!_loaded)
    {
        await Load();
    }

    return await Task.Run(() =>
    {
        // Rewind so the whole stream is read regardless of earlier reads.
        _stream.Position = 0;

        var settings = new MagickReadSettings { Density = new Density(300) };
        var extension = GetFilenameExtension(format);

        if (readAsOneImage)
        {
            using (var images = new MagickImageCollection())
            {
                images.Read(_stream, settings);
                using (var vertical = images.AppendVertically())
                {
                    return WriteImageViaTempFile(path => vertical.Write(path), extension);
                }
            }
        }

        var pages = new List<byte[]>();
        using (var images = new MagickImageCollection())
        {
            images.Read(_stream, settings);
            foreach (var image in images.OfType<MagickImage>())
            {
                pages.Add(WriteImageViaTempFile(path => image.Write(path), extension));
            }
        }

        return ImageManipulation.CombineImages(pages, format);
    });
}

/// <summary>
/// Runs <paramref name="write"/> against a uniquely named file in the system
/// temp directory, returns the file's bytes, and always deletes the file.
/// </summary>
/// <remarks>
/// A unique temp path replaces the previous fixed "pdf.&lt;ext&gt;" file in the
/// working directory, so concurrent calls cannot clobber each other and no
/// files are left behind. The extension is kept because the writer picks its
/// output format from it.
/// </remarks>
private static byte[] WriteImageViaTempFile(System.Action<string> write, string extension)
{
    // todo handle with stream instead of file
    var path = Path.Combine(
        Path.GetTempPath(),
        Path.ChangeExtension(Path.GetRandomFileName(), extension));
    try
    {
        write(path);
        return File.ReadAllBytes(path);
    }
    finally
    {
        // File.Delete does not throw when the file does not exist,
        // so this is safe even if the write failed before creating it.
        File.Delete(path);
    }
}