/// <summary>
/// Downloads an input blob, extracts its text (including OCR text from images embedded in PDFs),
/// and sends a newly created text document on for indexing.
/// </summary>
/// <remarks>
/// Only <c>.pdf</c>, <c>.docx</c> and <c>.xlsx</c> blobs are processed; a blob with any other
/// extension is passed to <c>MarkAndSendDocumentAsync</c> unchanged. When processing raises an
/// <see cref="ApplicationException"/> (including the case where no text could be extracted),
/// the original blob is passed along instead and the result is flagged as a warning. Any other
/// exception is logged and reported through the returned result rather than rethrown.
/// </remarks>
/// <param name="blob">The source blob to download and process.</param>
/// <param name="entityId">Identifier forwarded to <c>MarkAndSendDocumentAsync</c>.</param>
/// <param name="indexingContainer">Indexing destination forwarded to <c>MarkAndSendDocumentAsync</c>.</param>
/// <param name="log">Trace writer used for informational, warning and error messages.</param>
/// <returns>
/// A <see cref="ProcessingResult"/> whose status is <c>Success</c> (document sent),
/// <c>Warning</c> (original blob sent because processing failed; <c>Message</c> has details), or
/// <c>Failure</c> (<c>DocumentLocation</c> is null and <c>Message</c> holds the error and stack trace).
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="blob"/> is null.</exception>
private static async Task<ProcessingResult> ProcessInputBlobForScanningAsync(
    CloudBlockBlob blob,
    string entityId,
    string indexingContainer,
    TraceWriter log)
{
    if (blob == null)
    {
        throw new ArgumentNullException(nameof(blob));
    }

    ProcessingResult result = new ProcessingResult();
    string processedBlobName = Path.GetFileNameWithoutExtension(blob.Name);

    try
    {
        // Use the async variant so the calling thread is not blocked on storage I/O.
        await blob.FetchAttributesAsync();

        // Invariant casing: the extension is an identifier, not user-facing text.
        var extension = Path.GetExtension(blob.Name).ToLowerInvariant();
        var builder = new StringBuilder();

        using (var stream = new MemoryStream())
        {
            await blob.DownloadToStreamAsync(stream);

            // Writing leaves the position at the end of the data; rewind so that any helper
            // reading from the current position sees the whole document.
            stream.Position = 0;

            try
            {
                // currently only process certain types of documents
                // if not a processable type of document, we just pass through to indexing location
                switch (extension)
                {
                    case ".pdf":
                        var bytes = stream.ToArray();

                        // if there is any text within document add to builder
                        DocumentProcessor.GetTextFromPdf(bytes, out builder);

                        // Defensive: the helper assigns the out parameter and is opaque from here.
                        builder = builder ?? new StringBuilder();

                        // extract all images within document that are greater than 50x50 pixels
                        List<Stream> images = DocumentProcessor.ExtractImagesFromPDF(bytes, log);
                        await AppendOcrTextFromPdfImagesAsync(images, builder, blob.Name, log);
                        break;

                    case ".docx":
                        builder.Append(OfficeHelper.GetAllTextFromWordDoc(stream, log));
                        break;

                    case ".xlsx":
                        builder.Append(OfficeHelper.GetAllTextFromExcelDoc(stream, log));
                        break;

                    default:
                        // document is not a processable document type. just send through for indexing
                        result.Status = ProcessingStatus.Success;
                        result.DocumentLocation = await MarkAndSendDocumentAsync(
                            entityId, blob, indexingContainer, processedBlobName, log);
                        return result;
                }

                if (builder.Length == 0)
                {
                    // Caught just below: an empty document falls back to sending the original blob.
                    throw new ApplicationException("Text could not be extracted from Document. Can't create empty document");
                }

                // we always create a new document for indexing with all existing text merged with image text
                using (var textStream = await DocumentProcessor.CreateTextDocumentAsync(builder.ToString()))
                {
                    log.Info("Indexable document created successfully!");
                    result.Status = ProcessingStatus.Success;
                    result.DocumentLocation = await MarkAndSendDocumentAsync(
                        entityId, textStream, indexingContainer, processedBlobName, log);
                    return result;
                }
            }
            catch (ApplicationException aex)
            {
                var errorMsg = "Document failed to get processed. Passing document along to indexing location";
                log.Warning(errorMsg);

                // something went wrong processing document, just send through to get indexed
                result.Status = ProcessingStatus.Warning;
                result.Message = $"{errorMsg}. Error:{aex.Message}";
                result.DocumentLocation = await MarkAndSendDocumentAsync(
                    entityId, blob, indexingContainer, processedBlobName, log);
                return result;
            }
        }
    }
    catch (Exception ex)
    {
        result.Status = ProcessingStatus.Failure;
        result.DocumentLocation = null;
        result.Message = $"Failed to process document {blob.Name} due to the following error: {ex.Message}{Environment.NewLine}{ex.StackTrace}";
        log.Error(result.Message);
        return result;
    }
}

/// <summary>
/// Runs OCR over each image extracted from a PDF, appends the recognized text to
/// <paramref name="builder"/>, and disposes every image stream when done.
/// </summary>
/// <remarks>
/// A failure on one image is logged as a warning and does not stop the remaining images
/// from being scanned.
/// </remarks>
/// <param name="images">Image streams extracted from the PDF; may be null or empty.</param>
/// <param name="builder">Accumulator that receives a space followed by each image's text.</param>
/// <param name="blobName">Name of the source blob, used only in log messages.</param>
/// <param name="log">Trace writer used for progress and warning messages.</param>
private static async Task AppendOcrTextFromPdfImagesAsync(
    List<Stream> images,
    StringBuilder builder,
    string blobName,
    TraceWriter log)
{
    if (images == null || images.Count == 0)
    {
        return;
    }

    try
    {
        int imageCounter = 0;
        foreach (Stream img in images)
        {
            imageCounter++;
            try
            {
                builder.Append(" " + DocumentProcessor.ScanImageToString(img));
                log.Info($"OCR completed successfully for pdf image #{imageCounter}");

                // Azure Vision service has a cap on images processed per second
                // let's slow it down
                await Task.Delay(1000);
            }
            catch (ArgumentException aex)
            {
                // stream isn't a valid image
                log.Warning($"Failed to open image #{imageCounter} of {images.Count} for {blobName}. Error:{aex.Message}");
            }
            catch (Exception ex)
            {
                log.Warning($"Failed to OCR scan pfd image #{imageCounter} of {images.Count} for {blobName}. Error:{ex.Message}");

                // Vision API can throw ClientException, grab inner exception for details
                var clientException = ex.InnerException as ClientException;
                if (clientException != null)
                {
                    log.Warning($"InnerException Details: Message={clientException.Error?.Message}");
                }
            }
        }
    }
    finally
    {
        // The extracted image streams are owned here; release them even if scanning aborted.
        foreach (Stream img in images)
        {
            img?.Dispose();
        }
    }
}