Exemple #1
0
        /// <summary>
        /// Returns the text of the document, preferring directly extracted text and
        /// falling back to OCR on a TIFF rendering when no usable text is found.
        /// </summary>
        /// <param name="tessdataLocation">
        /// Passed through unchanged to <c>Tesseract.GetTextFromTiff</c>; may be <c>null</c>.
        /// </param>
        /// <returns>
        /// The result of <c>GetTextFromSimplePdf</c> when it contains any non-whitespace
        /// character; otherwise the text produced by <c>Tesseract.GetTextFromTiff</c>.
        /// </returns>
        /// <remarks>
        /// An <see cref="IOException"/> raised by the retry OCR attempt is not caught and
        /// propagates to the caller.
        /// </remarks>
        public async Task<string> GetText(string tessdataLocation = null)
        {
            if (!_loaded)
            {
                await Load();
            }

            var attempt = GetTextFromSimplePdf();

            // Null, empty and whitespace-only results all count as "no text found".
            if (!string.IsNullOrWhiteSpace(attempt))
            {
                return attempt;
            }

            // No usable text: render the document as a TIFF and run OCR on it.
            var tiff = await GetImage(ImageFormat.Tiff);

            try
            {
                return await Tesseract.GetTextFromTiff(tiff, tessdataLocation);
            }
            catch (IOException)
            {
                // The first OCR attempt failed with an I/O error: re-save the image
                // as TIFF and try exactly once more.
                tiff = ImageManipulation.ChangeFormat(tiff, ImageFormat.Tiff);
                return await Tesseract.GetTextFromTiff(tiff, tessdataLocation);
            }
        }
Exemple #2
0
        /// <summary>
        /// Rasterizes the content of <c>_stream</c> and returns it as one encoded image.
        /// </summary>
        /// <param name="format">
        /// Requested output format. Its extension, as returned by
        /// <c>GetFilenameExtension</c>, is used as the extension of the intermediate
        /// file, and the format is also passed to <c>ImageManipulation.CombineImages</c>
        /// in the page-by-page path.
        /// </param>
        /// <param name="readAsOneImage">
        /// When <c>true</c> (default) all images read from the stream are appended
        /// vertically and written once. When <c>false</c> each image is written
        /// separately and the results are merged by <c>ImageManipulation.CombineImages</c>.
        /// </param>
        /// <returns>The encoded image bytes.</returns>
        private async Task<byte[]> GetImage(ImageFormat format, bool readAsOneImage = true)
        {
            if (!_loaded)
            {
                await Load();
            }

            return await Task.Run(() =>
            {
                // Always read from the start, regardless of earlier reads of the stream.
                _stream.Position = 0;

                // A single-argument Density applies the same value to both axes.
                var settings = new MagickReadSettings
                {
                    Density = new Density(300)
                };

                var pages = new List<byte[]>();

                using (var images = new MagickImageCollection())
                {
                    images.Read(_stream, settings);

                    if (readAsOneImage)
                    {
                        using (var vertical = images.AppendVertically())
                        {
                            return WriteViaTempFile(GetFilenameExtension(format), path => vertical.Write(path));
                        }
                    }

                    foreach (var image in images.OfType<MagickImage>())
                    {
                        pages.Add(WriteViaTempFile(GetFilenameExtension(format), path => image.Write(path)));
                    }
                }

                return ImageManipulation.CombineImages(pages, format);
            });
        }

        /// <summary>
        /// Lets <paramref name="write"/> save an image to a uniquely named temporary
        /// file, reads the file back and always deletes it afterwards.
        /// </summary>
        /// <param name="extension">File extension without the leading dot.</param>
        /// <param name="write">Callback that writes the image to the given path.</param>
        /// <returns>The bytes of the written file.</returns>
        private static byte[] WriteViaTempFile(string extension, System.Action<string> write)
        {
            // todo handle with stream instead of file
            // A random name in the temp directory keeps concurrent conversions from
            // overwriting each other and avoids depending on the working directory.
            var path = Path.Combine(Path.GetTempPath(), $"{Path.GetRandomFileName()}.{extension}");

            try
            {
                write(path);
                return File.ReadAllBytes(path);
            }
            finally
            {
                // File.Delete is a no-op when the file was never created.
                File.Delete(path);
            }
        }