C# (CSharp) DocumentProcessor.ExtractImagesFromPDF Beispiele

Programmiersprache: C# (CSharp)

Klasse / Typ: DocumentProcessor

Methode / Funktion: ExtractImagesFromPDF

Beispiele auf hotexamples.com: 1

C# (CSharp) DocumentProcessor.ExtractImagesFromPDF - 1 Beispiel gefunden. Dies ist das am besten bewertete C# (CSharp) Beispiel für die Methode DocumentProcessor.ExtractImagesFromPDF, das aus Open-Source-Projekten extrahiert wurde. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

RunAsync(4)

Process(4)

GetListOfFields(2)

CreateBitmap(2)

Read(2)

ProcessDocument(2)

GetDocumentXml(2)

ReplaceFields(1)

Replace(1)

NormalizeCode(1)

MergeDocuments(1)

GetTextFromPdf(1)

Analyze(1)

GetPartFilter(1)

BasicStats(1)

GenerateHelp(1)

ExtractImagesFromPDF(1)

EvaluateDocument(1)

DeleteDocument(1)

DAIRparser(1)

CreateTextDocumentAsync(1)

BuildDocument(1)

GetDocumentsForUser(1)

Beispiel #1

Datei anzeigen

        /// <summary>
        /// Downloads <paramref name="blob"/>, extracts its text (for PDFs this includes OCR text from
        /// embedded images) and sends a text-only document on for indexing.
        /// </summary>
        /// <remarks>
        /// Only <c>.pdf</c>, <c>.docx</c> and <c>.xlsx</c> blobs are processed; any other extension is
        /// forwarded unchanged with a <c>Success</c> status. When processing raises an
        /// <see cref="ApplicationException"/> (including the "no text extracted" case) the original blob
        /// is forwarded and the result is a <c>Warning</c>. Any other exception raised inside the
        /// processing pipeline is logged and reported as a <c>Failure</c> result instead of being rethrown.
        /// </remarks>
        /// <param name="blob">The source blob to download and process.</param>
        /// <param name="entityId">Identifier handed through to <c>MarkAndSendDocumentAsync</c>.</param>
        /// <param name="indexingContainer">Destination container name handed through to <c>MarkAndSendDocumentAsync</c>.</param>
        /// <param name="log">Trace writer used for progress, warning and error messages.</param>
        /// <returns>A <c>ProcessingResult</c> describing the status, message and resulting document location.</returns>
        private static async Task<ProcessingResult> ProcessInputBlobForScanningAsync(
            CloudBlockBlob blob,
            string entityId,
            string indexingContainer,
            TraceWriter log)
        {
            ProcessingResult result            = new ProcessingResult();
            string           processedBlobName = Path.GetFileNameWithoutExtension(blob.Name);

            try
            {
                // Use the async variant so the calling thread is not blocked on storage I/O.
                await blob.FetchAttributesAsync();

                // NOTE(review): this local is never read; kept only in case the constructor has
                // side effects - confirm and remove if it does not.
                var storageHelper = new StorageHelper();

                // Invariant lowering: the extension is an identifier, not user-facing text.
                var extension     = Path.GetExtension(blob.Name).ToLowerInvariant();
                var builder       = new StringBuilder();

                using (var stream = new MemoryStream())
                {
                    await blob.DownloadToStreamAsync(stream);

                    // Writing the download advanced the position to the end of the data; rewind so
                    // the stream-based readers below start from the beginning.
                    stream.Position = 0;

                    try
                    {
                        // currently only process certain types of documents
                        // if not a processable type of document, we just pass through to indexing location
                        switch (extension)
                        {
                        case ".pdf":

                            var bytes = stream.ToArray();

                            // if there is any text within document add to builder
                            DocumentProcessor.GetTextFromPdf(bytes, out builder);

                            // extract the embedded images (the original author's note says only images
                            // larger than 50x50 pixels; any such filter lives in ExtractImagesFromPDF)
                            List<Stream> images = DocumentProcessor.ExtractImagesFromPDF(bytes, log);

                            await AppendOcrTextFromPdfImagesAsync(images, builder, blob.Name, log);
                            break;

                        case ".docx":

                            builder.Append(OfficeHelper.GetAllTextFromWordDoc(stream, log));
                            break;

                        case ".xlsx":

                            builder.Append(OfficeHelper.GetAllTextFromExcelDoc(stream, log));
                            break;

                        default:

                            // document is not a processable document type.  just send through for indexing
                            result.Status           = ProcessingStatus.Success;
                            result.DocumentLocation = await MarkAndSendDocumentAsync(
                                entityId,
                                blob,
                                indexingContainer,
                                processedBlobName,
                                log);

                            return result;
                        }

                        if (builder.Length == 0)
                        {
                            // Handled by the ApplicationException catch below, which forwards the original blob.
                            throw new ApplicationException("Text could not be extracted from Document.  Can't create empty document");
                        }

                        // we always create a new pdf doc for indexing with all existing text merged with image text
                        using (var textStream = await DocumentProcessor.CreateTextDocumentAsync(builder.ToString()))
                        {
                            log.Info("Indexable document created successfully!");

                            result.Status           = ProcessingStatus.Success;
                            result.DocumentLocation = await MarkAndSendDocumentAsync(
                                entityId,
                                textStream,
                                indexingContainer,
                                processedBlobName,
                                log);

                            return result;
                        }
                    }
                    catch (ApplicationException aex)
                    {
                        var errorMsg = "Document failed to get processed.  Passing document along to indexing location";
                        log.Warning(errorMsg);

                        // something went wrong processing document, just send through to get indexed
                        result.Status           = ProcessingStatus.Warning;
                        result.Message          = $"{errorMsg}. Error:{aex.Message}";
                        result.DocumentLocation = await MarkAndSendDocumentAsync(
                            entityId,
                            blob,
                            indexingContainer,
                            processedBlobName,
                            log);

                        return result;
                    }
                }
            }
            catch (Exception ex)
            {
                result.Status           = ProcessingStatus.Failure;
                result.DocumentLocation = null;
                result.Message          = $"Failed to process document {blob.Name} due to the following error: {ex.Message}{Environment.NewLine}{ex.StackTrace}";
                log.Error(result.Message);
                return result;
            }
        }

        // Runs OCR over each extracted PDF image, appends the recognized text to builder, and disposes every image stream.
        private static async Task AppendOcrTextFromPdfImagesAsync(
            List<Stream> images,
            StringBuilder builder,
            string blobName,
            TraceWriter log)
        {
            // Nothing to scan; the caller continues with whatever text was already extracted.
            if (images == null || images.Count == 0)
            {
                return;
            }

            int imageCounter = 0;

            foreach (Stream img in images)
            {
                imageCounter++;

                try
                {
                    builder.Append(" " + DocumentProcessor.ScanImageToString(img));
                    log.Info($"OCR completed successfully for pdf image #{imageCounter}");

                    // Azure Vision service has a cap on images processed per second
                    // let's slow it down
                    await Task.Delay(1000);
                }
                catch (ArgumentException aex)
                {
                    // stream isn't a valid image; skip it and keep going with the remaining images
                    log.Warning($"Failed to open image #{imageCounter} of {images.Count} for {blobName}. Error:{aex.Message}");
                }
                catch (Exception ex)
                {
                    // A single failed OCR call must not abort the whole document.
                    log.Warning($"Failed to OCR scan pdf image #{imageCounter} of {images.Count} for {blobName}. Error:{ex.Message}");

                    // Vision API can throw ClientException, grab inner exception for details
                    var clientException = ex.InnerException as ClientException;
                    if (clientException != null)
                    {
                        log.Warning($"InnerException Details: Message={clientException.Error.Message}");
                    }
                }
                finally
                {
                    // The extracted image streams are owned here; release each one once it has been scanned.
                    if (img != null)
                    {
                        img.Dispose();
                    }
                }
            }
        }