Ejemplo n.º 1
0
        /// <summary>
        /// Downloads the document referenced by <paramref name="newItem"/>, extracts the text of
        /// every page and enqueues a background job that sends the result to the callback.
        /// </summary>
        /// <remarks>
        /// For each page the text is first requested from <c>_pdfToText</c>; when that returns
        /// <c>null</c> the page is rendered to an image and the text is requested from
        /// <c>_cognitive</c> instead. Any exception is caught and reported through the same
        /// callback job with <c>Success = false</c> and the exception message.
        /// Intermediate files are deleted in <c>finally</c> blocks so they are removed even when
        /// a step fails part-way through.
        /// </remarks>
        /// <param name="newItem">Request carrying the document URL; also forwarded to the callback job.</param>
        /// <returns>A task that completes once the callback job has been enqueued.</returns>
        public async Task Process(NewItemRequest newItem)
        {
            try
            {
                var documentResult = new DocumentResultResponse();
                var filePath       = _fileManager.Download(newItem.DocumentUrl);

                try
                {
                    var numberOfPages = _fileManager.GetNumberOfPages(filePath);

                    // Pages are 1-based.
                    for (var page = 1; page <= numberOfPages; page++)
                    {
                        var pagePath = _fileManager.GeneratePage(filePath, page);
                        try
                        {
                            var text = _pdfToText.Get(pagePath);
                            if (text == null)
                            {
                                // No text from the direct extraction: fall back to the image-based service.
                                var imagePath = _fileManager.GeneratePageInImage(filePath, page);
                                try
                                {
                                    text = await _cognitive.Get(imagePath).ConfigureAwait(false);
                                }
                                finally
                                {
                                    _fileManager.Delete(imagePath);
                                }
                            }
                            documentResult.AddProcessedPage(text, page);
                        }
                        finally
                        {
                            _fileManager.Delete(pagePath);
                        }
                    }
                }
                finally
                {
                    // Remove the downloaded document whether or not page processing succeeded.
                    _fileManager.Delete(filePath);
                }

                documentResult.Success = true;
                _backgroundJobs.Enqueue(() => _callback.Send(documentResult, newItem));
            }
            catch (Exception ex)
            {
                var documentResult = new DocumentResultResponse {
                    Success = false, ErrorMessage = ex.Message
                };
                _backgroundJobs.Enqueue(() => _callback.Send(documentResult, newItem));
            }
        }
Ejemplo n.º 2
0
        // Shared across calls: creating and disposing an HttpClient per request can exhaust
        // sockets under load, and HttpClient is safe to reuse for concurrent requests.
        private static readonly HttpClient SharedCallbackClient = new HttpClient();

        /// <summary>
        /// Serializes <paramref name="documentResult"/> to JSON and sends it with an HTTP PUT
        /// to <c>newItem.CallbackUrl</c>.
        /// </summary>
        /// <param name="documentResult">Result object that is serialized as the request body.</param>
        /// <param name="newItem">Request whose <c>CallbackUrl</c> is the PUT target.</param>
        /// <returns>A task that completes when the callback answered with HTTP 200.</returns>
        /// <exception cref="Exception">Thrown when the response status code is not 200 (OK).</exception>
        public async Task Send(DocumentResultResponse documentResult, NewItemRequest newItem)
        {
            var jsonContent = JsonConvert.SerializeObject(documentResult);
            using var contentString = new StringContent(jsonContent, Encoding.UTF8, "application/json");

            // Overwrites the Content-Type set by the constructor with a bare "application/json" value.
            contentString.Headers.ContentType = new MediaTypeHeaderValue("application/json");

            using var response = await SharedCallbackClient
                .PutAsync(newItem.CallbackUrl, contentString)
                .ConfigureAwait(false);

            // Only an exact 200 counts as success; other 2xx codes are treated as failures.
            if (response.StatusCode != HttpStatusCode.OK)
            {
                throw new Exception("Error to callback");
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Serializes <paramref name="documentResult"/> to JSON and sends it with an HTTP PUT
        /// to <paramref name="callbackUrl"/>, writing the target URL to the console first.
        /// </summary>
        /// <param name="documentResult">Result object that is serialized as the request body.</param>
        /// <param name="callbackUrl">URL that receives the PUT request.</param>
        /// <returns>A task that completes when the callback answered with HTTP 200.</returns>
        /// <exception cref="Exception">
        /// Thrown when the response status code is not 200 (OK); the message contains the response body.
        /// </exception>
        public virtual async Task Send(DocumentResultResponse documentResult, string callbackUrl)
        {
            Console.WriteLine($"Url to response client: {callbackUrl}");
            var resultAtJson = JsonConvert.SerializeObject(documentResult);

            using (var content = new StringContent(resultAtJson, Encoding.UTF8, "application/json"))
            {
                // Overwrites the Content-Type set by the constructor with a bare "application/json" value.
                content.Headers.ContentType = new MediaTypeHeaderValue("application/json");

                // Dispose the response so its content stream and connection are released promptly.
                using (var response = await client.PutAsync(callbackUrl, content).ConfigureAwait(false))
                {
                    // Only an exact 200 counts as success; other 2xx codes are treated as failures.
                    if (response.StatusCode != HttpStatusCode.OK)
                    {
                        var clientResponse = await response.Content.ReadAsStringAsync().ConfigureAwait(false);

                        throw new Exception($"Error to response client: {clientResponse}");
                    }
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Processes the pages of <paramref name="documentResult"/> in consecutive batches,
        /// awaiting every page of a batch before starting the next one.
        /// </summary>
        /// <remarks>
        /// The batch size is <c>EnvironmentVariables.NumberMaxDocumentsInParallel</c>, capped at
        /// <paramref name="numberOfPages"/>; the final batch shrinks to the pages that remain.
        /// Nothing is processed when the initial batch size is zero or negative.
        /// </remarks>
        /// <param name="filePath">Path passed through to <c>ProcessPage</c> for every page.</param>
        /// <param name="numberOfPages">Total page count used to size and terminate the batches.</param>
        /// <param name="documentResult">Result whose <c>Pages</c> are processed.</param>
        /// <returns>A task that completes when all batches have finished.</returns>
        private async Task RunInParallel(string filePath, int numberOfPages, DocumentResultResponse documentResult)
        {
            var maxBatchSize = EnvironmentVariables.NumberMaxDocumentsInParallel;
            var batchSize    = numberOfPages > maxBatchSize ? maxBatchSize : numberOfPages;
            var offset       = 0;

            while (batchSize > 0)
            {
                var batch = documentResult.Pages
                            .Skip(offset)
                            .Take(batchSize)
                            .Select(page => ProcessPage(page, filePath));
                await Task.WhenAll(batch).ConfigureAwait(false);

                offset += batchSize;

                // Keep the full batch size while more than one batch remains; otherwise take the rest.
                var remaining = numberOfPages - offset;
                if (remaining <= batchSize)
                {
                    batchSize = remaining;
                }
            }
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Downloads the file referenced by <paramref name="newItem"/>, processes its pages via
        /// <c>RunInParallel</c> and returns the populated result.
        /// </summary>
        /// <remarks>
        /// Exceptions are not propagated: any failure is converted into a
        /// <c>DocumentResultResponse</c> built from the exception message. The downloaded file
        /// is deleted in a <c>finally</c> block so it is removed on failure as well as on success.
        /// </remarks>
        /// <param name="newItem">Item providing the URL, document identifier and access key.</param>
        /// <returns>
        /// A result with <c>Success = true</c> when processing completed, otherwise a result
        /// constructed from the error message.
        /// </returns>
        private async Task <DocumentResultResponse> ExtractOcr(NewFileToProcess newItem)
        {
            try
            {
                var filePath = await _fileManager.Download(newItem.Url).ConfigureAwait(false);

                try
                {
                    // NOTE(review): the command output is not used here; the call is kept because it
                    // presumably fails for files pdftotext cannot read — confirm, otherwise remove it.
                    // NOTE(review): filePath is interpolated unquoted into a shell command — verify it
                    // can never contain spaces or shell metacharacters.
                    await $"pdftotext {filePath} -".Bash().ConfigureAwait(false);

                    var numberOfPages  = _fileManager.GetNumberOfPages(filePath);
                    var documentResult = new DocumentResultResponse(numberOfPages, newItem.DocumentIdentifier, newItem.AccessKey);
                    await RunInParallel(filePath, numberOfPages, documentResult).ConfigureAwait(false);

                    Console.WriteLine($"Finishing document with URL {newItem.Url}");
                    documentResult.Success = true;
                    return(documentResult);
                }
                finally
                {
                    // Remove the downloaded file whether or not processing succeeded.
                    _fileManager.Delete(filePath);
                }
            }
            catch (Exception ex) {
                return(new DocumentResultResponse(ex.Message, newItem.DocumentIdentifier, newItem.AccessKey));
            }
        }