Пример #1
0
        /// <summary>
        /// Polls Textract for the outcome of the asynchronous analysis job identified by
        /// <paramref name="response"/> and returns once the job is no longer in progress.
        /// </summary>
        /// <param name="response">Start response whose <c>JobId</c> identifies the job to poll.</param>
        /// <returns>
        /// The last polled response; its <c>JobStatus</c> is neither IN_PROGRESS nor FAILED.
        /// </returns>
        /// <exception cref="System.InvalidOperationException">
        /// The job finished with status FAILED.
        /// </exception>
        private async Task <GetDocumentAnalysisResponse> GetDocAnalysisResponse(StartDocumentAnalysisResponse response)
        {
            // Build the poll request from the job id returned by the start call.
            var pollRequest = new GetDocumentAnalysisRequest
            {
                JobId = response.JobId
            };

            GetDocumentAnalysisResponse result = await this.textractClient.GetDocumentAnalysisAsync(pollRequest);
            int attempt = 0;

            // Keep polling only while the job is still running. Looping until "SUCCEEDED"
            // would spin forever once the job ends in any other terminal state.
            while (result.JobStatus == "IN_PROGRESS")
            {
                // Short pause between polls; results can take well over 15 seconds to arrive.
                await Task.Delay(200);

                result = await this.textractClient.GetDocumentAnalysisAsync(pollRequest);

                attempt++;
                System.Diagnostics.Debug.WriteLine($"Trying again.... {result.JobStatus} Attempt {attempt}");
            }

            if (result.JobStatus == "FAILED")
            {
                // Surface the failure to the caller instead of waiting on a job that will never succeed.
                throw new System.InvalidOperationException(
                    $"Textract analysis job {response.JobId} failed: {result.StatusMessage}");
            }

            return result;
        }
Пример #2
0
        /// <summary>
        /// Runs a Textract document analysis on an uploaded PDF. The file is first stored in the
        /// S3 bucket named by the BUCKET_NAME environment variable (the analysis request points
        /// at an S3 object), the analysis is started and awaited, and the temporary S3 object is
        /// removed afterwards.
        /// </summary>
        /// <param name="file">The uploaded PDF to analyze.</param>
        /// <param name="featureTypes">Feature types passed through to the analysis request.</param>
        /// <returns>The analysis result returned by <c>GetDocAnalysisResponse</c>.</returns>
        private async Task <GetDocumentAnalysisResponse> StartPDFAnalysis(IFormFile file, List <string> featureTypes)
        {
            // A fresh GUID is used as the S3 object key for this upload.
            string objectKey = Guid.NewGuid().ToString();

            // Await rather than block with .Wait(): blocking inside an async method ties up a
            // thread and wraps failures in AggregateException.
            await PDFtoS3Bucket(file, objectKey);

            try
            {
                // Point the analysis request at the object that was just uploaded.
                var request = new StartDocumentAnalysisRequest
                {
                    DocumentLocation = new DocumentLocation
                    {
                        S3Object = new Amazon.Textract.Model.S3Object
                        {
                            Bucket = Environment.GetEnvironmentVariable("BUCKET_NAME"),
                            Name   = objectKey
                        }
                    },
                    FeatureTypes = featureTypes
                };

                // Start the analysis, then wait for it to finish.
                var startResponse = await this.textractClient.StartDocumentAnalysisAsync(request);

                return await GetDocAnalysisResponse(startResponse);
            }
            finally
            {
                // Remove the PDF from S3 whether or not the analysis succeeded, so failed runs
                // do not leave uploads behind.
                await RemoveFromS3Bucket(objectKey);
            }
        }
Пример #3
0
        /// <summary>
        /// Gets the results for an Amazon Textract asynchronous operation that analyzes text
        /// in a document.
        ///
        ///
        /// <para>
        /// You start asynchronous text analysis by calling <a>StartDocumentAnalysis</a>, which
        /// returns a job identifier (<code>JobId</code>). When the text analysis operation finishes,
        /// Amazon Textract publishes a completion status to the Amazon Simple Notification Service
        /// (Amazon SNS) topic that's registered in the initial call to <code>StartDocumentAnalysis</code>.
        /// To get the results of the text-detection operation, first check that the status value
        /// published to the Amazon SNS topic is <code>SUCCEEDED</code>. If so, call <code>GetDocumentAnalysis</code>,
        /// and pass the job identifier (<code>JobId</code>) from the initial call to <code>StartDocumentAnalysis</code>.
        /// </para>
        ///
        /// <para>
        ///  <code>GetDocumentAnalysis</code> returns an array of <a>Block</a> objects. The following
        /// types of information are returned:
        /// </para>
        ///  <ul> <li>
        /// <para>
        /// Words and lines that are related to nearby lines and words. The related information
        /// is returned in two <a>Block</a> objects each of type <code>KEY_VALUE_SET</code>: a
        /// KEY Block object and a VALUE Block object. For example, <i>Name: Ana Silva Carolina</i>
        /// contains a key and value. <i>Name:</i> is the key. <i>Ana Silva Carolina</i> is the
        /// value.
        /// </para>
        ///  </li> <li>
        /// <para>
        /// Table and table cell data. A TABLE Block object contains information about a detected
        /// table. A CELL Block object is returned for each cell in a table.
        /// </para>
        ///  </li> <li>
        /// <para>
        /// Selectable elements such as checkboxes and radio buttons. A SELECTION_ELEMENT Block
        /// object contains information about a selectable element.
        /// </para>
        ///  </li> <li>
        /// <para>
        /// Lines and words of text. A LINE Block object contains one or more WORD Block objects.
        /// </para>
        ///  </li> </ul>
        /// <para>
        /// Use the <code>MaxResults</code> parameter to limit the number of blocks returned.
        /// If there are more results than specified in <code>MaxResults</code>, the value of
        /// <code>NextToken</code> in the operation response contains a pagination token for getting
        /// the next set of results. To get the next page of results, call <code>GetDocumentAnalysis</code>,
        /// and populate the <code>NextToken</code> request parameter with the token value that's
        /// returned from the previous call to <code>GetDocumentAnalysis</code>.
        /// </para>
        ///
        /// <para>
        /// For more information, see <a href="https://docs.aws.amazon.com/textract/latest/dg/how-it-works-analyzing.html">Document
        /// Text Analysis</a>.
        /// </para>
        /// </summary>
        /// <param name="request">Container for the necessary parameters to execute the GetDocumentAnalysis service method.</param>
        /// <param name="cancellationToken">
        ///     A cancellation token that can be used by other objects or threads to receive notice of cancellation.
        /// </param>
        ///
        /// <returns>The response from the GetDocumentAnalysis service method, as returned by Textract.</returns>
        /// <exception cref="Amazon.Textract.Model.AccessDeniedException">
        /// You aren't authorized to perform the action.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InternalServerErrorException">
        /// Amazon Textract experienced a service issue. Try your call again.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InvalidJobIdException">
        /// An invalid job identifier was passed to <a>GetDocumentAnalysis</a>.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InvalidParameterException">
        /// An input parameter violated a constraint. For example, in synchronous operations,
        /// an <code>InvalidParameterException</code> exception occurs when neither of the <code>S3Object</code>
        /// or <code>Bytes</code> values are supplied in the <code>Document</code> request parameter.
        /// Validate your parameter before calling the API operation again.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.ProvisionedThroughputExceededException">
        /// The number of requests exceeded your throughput limit. If you want to increase this
        /// limit, contact Amazon Textract.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.ThrottlingException">
        /// Amazon Textract is temporarily unable to process the request. Try your call again.
        /// </exception>
        /// <seealso href="http://docs.aws.amazon.com/goto/WebAPI/textract-2018-06-27/GetDocumentAnalysis">REST API Reference for GetDocumentAnalysis Operation</seealso>
        public virtual Task <GetDocumentAnalysisResponse> GetDocumentAnalysisAsync(GetDocumentAnalysisRequest request, System.Threading.CancellationToken cancellationToken = default(CancellationToken))
        {
            // Select the marshaller/unmarshaller pair for this operation, then delegate the
            // call (and the caller's cancellation token) to InvokeAsync.
            var invokeOptions = new InvokeOptions
            {
                RequestMarshaller    = GetDocumentAnalysisRequestMarshaller.Instance,
                ResponseUnmarshaller = GetDocumentAnalysisResponseUnmarshaller.Instance
            };

            return InvokeAsync<GetDocumentAnalysisResponse>(request, invokeOptions, cancellationToken);
        }
Пример #4
0
        /// <summary>
        /// Synchronous counterpart of the GetDocumentAnalysis call: configures the request
        /// marshaller and response unmarshaller and delegates to <c>Invoke</c>.
        /// </summary>
        /// <param name="request">The GetDocumentAnalysis request to send.</param>
        /// <returns>The response produced by <c>Invoke</c>.</returns>
        internal virtual GetDocumentAnalysisResponse GetDocumentAnalysis(GetDocumentAnalysisRequest request)
        {
            var invokeOptions = new InvokeOptions
            {
                RequestMarshaller    = GetDocumentAnalysisRequestMarshaller.Instance,
                ResponseUnmarshaller = GetDocumentAnalysisResponseUnmarshaller.Instance
            };

            return Invoke<GetDocumentAnalysisResponse>(request, invokeOptions);
        }
Пример #5
0
        /// <summary>
        /// Handles a Textract job notification. For a SUCCEEDED job it pages through the
        /// analysis results, stores each page as JSON in S3 under <c>results/{jobId}/</c>, and
        /// then writes a manifest object listing the stored keys separated by ';'.
        /// Notifications with any other status are only logged.
        /// </summary>
        /// <param name="queueEvent">The notification carrying the job id and status.</param>
        /// <remarks>
        /// Failures while fetching or storing results are logged via <c>LogError</c> and not
        /// rethrown.
        /// </remarks>
        public override async Task ProcessMessageAsync(TextractNotification queueEvent)
        {
            LogInfo($"notificationMessage {JsonConvert.SerializeObject(queueEvent)}");
            if (queueEvent.Status != JobStatus.SUCCEEDED)
            {
                return;
            }

            try {
                var    jobId            = queueEvent.JobId;
                string nextToken        = null;
                var    counter          = 0;
                var    manifestContents = new List <string>();
                do
                {
                    // Fetch the next page of results for the job.
                    var getRequest = new GetDocumentAnalysisRequest {
                        JobId     = jobId,
                        NextToken = nextToken
                    };
                    LogInfo(JsonConvert.SerializeObject(getRequest));
                    var response = await _textractClient.GetDocumentAnalysisAsync(getRequest);

                    // Save this page to S3 and remember its key for the manifest.
                    var filePath = $"results/{jobId}/results_{counter}.json";
                    await PutTextObjectAsync(
                        filePath,
                        JsonConvert.SerializeObject(response),
                        "Unable to save results file to s3");
                    manifestContents.Add(filePath);

                    // A missing token means there are no further pages.
                    nextToken = response.NextToken;
                    counter++;
                } while (!string.IsNullOrEmpty(nextToken));

                var manifestContentsString = string.Join(';', manifestContents);
                LogInfo(manifestContentsString);

                // Write the manifest as a single line terminated by a newline.
                await PutTextObjectAsync(
                    $"results/{jobId}/manifest",
                    manifestContentsString + Environment.NewLine,
                    "Unable to save manifest file to s3");

                // NOTE(review): the reason for this short pause is not visible here (presumably
                // pacing between messages) — confirm. Awaited so no thread is blocked.
                await Task.Delay(100);
            }
            catch (Exception e) {
                LogError(e);
            }
        }

        /// <summary>
        /// Uploads <paramref name="content"/> as a text object to the configured bucket and
        /// verifies that S3 answered with HTTP 200.
        /// </summary>
        /// <param name="key">S3 object key to write.</param>
        /// <param name="content">Text to store.</param>
        /// <param name="failureMessage">Exception message used when the upload is not confirmed.</param>
        private async Task PutTextObjectAsync(string key, string content, string failureMessage)
        {
            using (var memoryStream = new MemoryStream()) {
                using (var streamWriter = new StreamWriter(memoryStream)) {
                    streamWriter.Write(content);
                    streamWriter.Flush();

                    // Writing leaves the position at the end of the stream; rewind so the
                    // upload starts from the first byte.
                    memoryStream.Position = 0;

                    var s3Response = await _s3Client.PutObjectAsync(new PutObjectRequest {
                        BucketName  = _bucketName,
                        Key         = key,
                        InputStream = memoryStream
                    });

                    if (s3Response == null || s3Response.HttpStatusCode != HttpStatusCode.OK)
                    {
                        throw new Exception(failureMessage);
                    }
                }
            }
        }