/// <summary>
/// Downloads the document referenced by <paramref name="newItem"/>, extracts the text of
/// every page and enqueues a background job that sends the outcome through <c>_callback</c>.
/// </summary>
/// <remarks>
/// For each page, text is first requested from <c>_pdfToText</c>; when that returns
/// <c>null</c> the page is rendered to an image and passed to <c>_cognitive</c> instead.
/// Any exception is caught and reported through the callback as an unsuccessful result
/// carrying the exception message, so this method itself does not throw for processing errors.
/// </remarks>
/// <param name="newItem">Request carrying the document URL; it is also forwarded to the callback.</param>
/// <returns>A task that completes once the callback job has been enqueued.</returns>
public async Task Process(NewItemRequest newItem)
{
    try
    {
        var documentResult = new DocumentResultResponse();
        var filePath = _fileManager.Download(newItem.DocumentUrl);

        try
        {
            var numberOfPages = _fileManager.GetNumberOfPages(filePath);

            // Pages are numbered starting at 1.
            for (var page = 1; page <= numberOfPages; page++)
            {
                var pagePath = _fileManager.GeneratePage(filePath, page);

                try
                {
                    var text = _pdfToText.Get(pagePath);

                    if (text == null)
                    {
                        // No text from the direct extraction: fall back to the image-based service.
                        var imagePath = _fileManager.GeneratePageInImage(filePath, page);

                        try
                        {
                            text = await _cognitive.Get(imagePath).ConfigureAwait(false);
                        }
                        finally
                        {
                            // Remove the temporary image even when the call above fails.
                            _fileManager.Delete(imagePath);
                        }
                    }

                    documentResult.AddProcessedPage(text, page);
                }
                finally
                {
                    // Remove the temporary page file on success and on failure alike.
                    _fileManager.Delete(pagePath);
                }
            }
        }
        finally
        {
            // Always remove the downloaded document so failed runs do not leave files behind.
            // NOTE(review): if Delete itself throws while another exception is in flight,
            // the callback will report the Delete failure instead of the original error.
            _fileManager.Delete(filePath);
        }

        documentResult.Success = true;
        _backgroundJobs.Enqueue(() => _callback.Send(documentResult, newItem));
    }
    catch (Exception ex)
    {
        // Report the failure to the caller through the same callback channel.
        var failedResult = new DocumentResultResponse
        {
            Success = false,
            ErrorMessage = ex.Message
        };

        _backgroundJobs.Enqueue(() => _callback.Send(failedResult, newItem));
    }
}
/// <summary>
/// Serializes <paramref name="documentResult"/> to JSON and sends it with an HTTP PUT
/// to the callback URL carried by <paramref name="newItem"/>.
/// </summary>
/// <param name="documentResult">Result payload to serialize as the request body.</param>
/// <param name="newItem">Request whose <c>CallbackUrl</c> is the PUT target.</param>
/// <returns>A task that completes when the callback answered with HTTP 200.</returns>
/// <exception cref="Exception">Thrown when the response status code is anything other than 200 (OK).</exception>
public async Task Send(DocumentResultResponse documentResult, NewItemRequest newItem)
{
    // NOTE(review): a new HttpClient per call can exhaust sockets under load; consider a
    // shared instance or IHttpClientFactory once the owning class can be changed.
    using var client = new HttpClient();

    var jsonContent = JsonConvert.SerializeObject(documentResult);

    // The content and the response are IDisposable; dispose them deterministically.
    using var contentString = new StringContent(jsonContent, Encoding.UTF8, "application/json");

    // Replaces the header produced by the constructor (which carries a charset parameter)
    // with a bare "application/json" media type.
    contentString.Headers.ContentType = new MediaTypeHeaderValue("application/json");

    using var response = await client.PutAsync(newItem.CallbackUrl, contentString).ConfigureAwait(false);

    // NOTE(review): only 200 is accepted; other 2xx codes (e.g. 204) are treated as failures — confirm intended.
    if (response.StatusCode != HttpStatusCode.OK)
    {
        throw new Exception("Error to callback");
    }
}
/// <summary>
/// Serializes <paramref name="documentResult"/> to JSON and sends it with an HTTP PUT
/// to <paramref name="callbackUrl"/>, using the <c>client</c> member defined outside this method.
/// </summary>
/// <param name="documentResult">Result payload to serialize as the request body.</param>
/// <param name="callbackUrl">Target URL of the PUT request; it is also written to the console.</param>
/// <returns>A task that completes when the callback answered with HTTP 200.</returns>
/// <exception cref="Exception">
/// Thrown when the response status code is anything other than 200 (OK); the message
/// includes the response body returned by the client.
/// </exception>
public virtual async Task Send(DocumentResultResponse documentResult, string callbackUrl)
{
    Console.WriteLine($"Url to response client: {callbackUrl}");

    var resultAtJson = JsonConvert.SerializeObject(documentResult);

    // The content and the response are IDisposable; dispose them deterministically.
    using var content = new StringContent(resultAtJson, Encoding.UTF8, "application/json");

    // Replaces the header produced by the constructor (which carries a charset parameter)
    // with a bare "application/json" media type.
    content.Headers.ContentType = new MediaTypeHeaderValue("application/json");

    using var response = await client.PutAsync(callbackUrl, content).ConfigureAwait(false);

    // NOTE(review): only 200 is accepted; other 2xx codes (e.g. 204) are treated as failures — confirm intended.
    if (response.StatusCode != HttpStatusCode.OK)
    {
        // The body is read before the response is disposed at the end of the method.
        var clientResponse = await response.Content.ReadAsStringAsync().ConfigureAwait(false);
        throw new Exception($"Error to response client: {clientResponse}");
    }
}
/// <summary>
/// Runs <c>ProcessPage</c> over <c>documentResult.Pages</c> in consecutive batches,
/// awaiting every batch before starting the next one.
/// </summary>
/// <remarks>
/// The batch size is <paramref name="numberOfPages"/> capped at
/// <c>EnvironmentVariables.NumberMaxDocumentsInParallel</c>; the final batch holds
/// whatever is left. Nothing is processed when that size is zero or negative.
/// </remarks>
/// <param name="filePath">Path handed to <c>ProcessPage</c> for every page.</param>
/// <param name="numberOfPages">Total number of pages to walk through.</param>
/// <param name="documentResult">Result object whose <c>Pages</c> are processed.</param>
/// <returns>A task that completes once every batch has finished.</returns>
private async Task RunInParallel(string filePath, int numberOfPages, DocumentResultResponse documentResult)
{
    var maxPerBatch = EnvironmentVariables.NumberMaxDocumentsInParallel;
    var batchSize = numberOfPages > maxPerBatch ? maxPerBatch : numberOfPages;

    // A non-positive batch size means there is nothing to schedule.
    if (batchSize <= 0)
    {
        return;
    }

    for (var offset = 0; offset < numberOfPages; offset += batchSize)
    {
        // The last batch may be smaller than the regular batch size.
        var remaining = numberOfPages - offset;
        var currentSize = remaining < batchSize ? remaining : batchSize;

        var batch = documentResult.Pages
            .Skip(offset)
            .Take(currentSize)
            .Select(page => ProcessPage(page, filePath));

        await Task.WhenAll(batch).ConfigureAwait(false);
    }
}
/// <summary>
/// Downloads the file referenced by <paramref name="newItem"/>, processes its pages via
/// <c>RunInParallel</c> and returns the resulting <see cref="DocumentResultResponse"/>.
/// </summary>
/// <param name="newItem">Item carrying the file URL, document identifier and access key.</param>
/// <returns>
/// The populated result with <c>Success</c> set to <c>true</c>; when any step throws,
/// a result built from the exception message, the document identifier and the access key.
/// </returns>
private async Task<DocumentResultResponse> ExtractOcr(NewFileToProcess newItem)
{
    try
    {
        var filePath = await _fileManager.Download(newItem.Url).ConfigureAwait(false);

        try
        {
            // NOTE(review): the output of this command is not used anywhere in this method;
            // the call is kept so that any failure it raises still ends up in the error result.
            // NOTE(review): filePath is interpolated unquoted into a shell command — confirm
            // that the file manager only produces shell-safe paths.
            await $"pdftotext {filePath} -".Bash().ConfigureAwait(false);

            var numberOfPages = _fileManager.GetNumberOfPages(filePath);
            var documentResult = new DocumentResultResponse(numberOfPages, newItem.DocumentIdentifier, newItem.AccessKey);

            await RunInParallel(filePath, numberOfPages, documentResult).ConfigureAwait(false);

            Console.WriteLine($"Finishing document with URL {newItem.Url}");

            documentResult.Success = true;
            return documentResult;
        }
        finally
        {
            // Remove the downloaded file on success and on failure, so errors do not leave it behind.
            _fileManager.Delete(filePath);
        }
    }
    catch (Exception ex)
    {
        // Failures are reported through the returned object rather than thrown.
        return new DocumentResultResponse(ex.Message, newItem.DocumentIdentifier, newItem.AccessKey);
    }
}