/// <summary>
/// Polls Textract until the analysis job started by <paramref name="response"/> leaves the
/// in-progress state, then returns the response from the final poll.
/// </summary>
/// <param name="response">The start-analysis response whose <c>JobId</c> identifies the job to poll.</param>
/// <returns>
/// The response from the last <c>GetDocumentAnalysisAsync</c> call. Only that single response
/// is returned; <c>NextToken</c> paging is not followed here.
/// </returns>
/// <exception cref="System.InvalidOperationException">The job finished with status <c>FAILED</c>.</exception>
private async Task<GetDocumentAnalysisResponse> GetDocAnalysisResponse(StartDocumentAnalysisResponse response)
{
    // Get the job id from the start-analysis response.
    var pollRequest = new GetDocumentAnalysisRequest { JobId = response.JobId };

    // Analysis can take well over 15 seconds, so the job is re-queried every 200 ms.
    GetDocumentAnalysisResponse result = await this.textractClient.GetDocumentAnalysisAsync(pollRequest);
    int attempt = 0;

    // Poll only while the job is still running. Looping until "SUCCEEDED" would never
    // terminate for a job that ends in any other state (for example FAILED).
    while (result.JobStatus == "IN_PROGRESS")
    {
        await Task.Delay(200);
        result = await this.textractClient.GetDocumentAnalysisAsync(pollRequest);
        attempt++;
        System.Diagnostics.Debug.WriteLine("Trying again.... " + result.JobStatus + " Attempt " + attempt);
    }

    if (result.JobStatus == "FAILED")
    {
        // Surface the failure to the caller instead of hanging or returning an empty result.
        throw new System.InvalidOperationException(
            "Textract document analysis job " + response.JobId + " failed: " + result.StatusMessage);
    }

    return result;
}
/// <summary>
/// Analyzes a PDF with Textract. The file is first uploaded to S3 under a fresh GUID key,
/// an analysis job is started against that S3 object, and the method then waits for the
/// job's results. The uploaded PDF is removed from S3 afterwards, whether or not the
/// analysis succeeded.
/// </summary>
/// <param name="file">The uploaded PDF to analyze.</param>
/// <param name="featureTypes">The feature types passed to the analysis request.</param>
/// <returns>The analysis response obtained once the job has finished.</returns>
private async Task<GetDocumentAnalysisResponse> StartPDFAnalysis(IFormFile file, List<string> featureTypes)
{
    // Upload the PDF to S3, using a GUID as the object key.
    var objectKey = Guid.NewGuid().ToString();

    // Awaited rather than blocked on with .Wait(), which ties up a thread inside an async method.
    await PDFtoS3Bucket(file, objectKey);

    try
    {
        // Point the analysis request at the uploaded S3 object.
        var request = new StartDocumentAnalysisRequest
        {
            DocumentLocation = new DocumentLocation
            {
                S3Object = new Amazon.Textract.Model.S3Object
                {
                    Bucket = Environment.GetEnvironmentVariable("BUCKET_NAME"),
                    Name = objectKey
                }
            },
            FeatureTypes = featureTypes
        };

        // Start the analysis, then wait for it to finish.
        var response = await this.textractClient.StartDocumentAnalysisAsync(request);
        return await GetDocAnalysisResponse(response);
    }
    finally
    {
        // Remove the PDF even when analysis throws, so failed runs do not leave files in the bucket.
        await RemoveFromS3Bucket(objectKey);
    }
}
/// <summary>
/// Retrieves the results of an Amazon Textract asynchronous operation that analyzes text
/// in a document.
///
/// <para>
/// Asynchronous text analysis is started by calling <a>StartDocumentAnalysis</a>, which
/// returns a job identifier (<code>JobId</code>). When the analysis finishes, Amazon Textract
/// publishes a completion status to the Amazon Simple Notification Service (Amazon SNS)
/// topic registered in that initial call. To fetch the results, first check that the status
/// published to the Amazon SNS topic is <code>SUCCEEDED</code>, then call
/// <code>GetDocumentAnalysis</code> with the <code>JobId</code> returned by
/// <code>StartDocumentAnalysis</code>.
/// </para>
///
/// <para>
/// <code>GetDocumentAnalysis</code> returns an array of <a>Block</a> objects carrying the
/// following kinds of information:
/// </para>
/// <ul>
/// <li>
/// <para>
/// Words and lines related to nearby lines and words. This is returned as two <a>Block</a>
/// objects of type <code>KEY_VALUE_SET</code>: a KEY Block and a VALUE Block. For example,
/// in <i>Name: Ana Silva Carolina</i>, <i>Name:</i> is the key and <i>Ana Silva Carolina</i>
/// is the value.
/// </para>
/// </li>
/// <li>
/// <para>
/// Table and table cell data. A TABLE Block describes a detected table, and a CELL Block is
/// returned for each cell in it.
/// </para>
/// </li>
/// <li>
/// <para>
/// Selectable elements such as checkboxes and radio buttons, each described by a
/// SELECTION_ELEMENT Block.
/// </para>
/// </li>
/// <li>
/// <para>
/// Lines and words of text. A LINE Block contains one or more WORD Blocks.
/// </para>
/// </li>
/// </ul>
///
/// <para>
/// Use the <code>MaxResults</code> parameter to cap the number of blocks returned. When more
/// results exist than <code>MaxResults</code> allows, the response's <code>NextToken</code>
/// holds a pagination token. To get the next page, call <code>GetDocumentAnalysis</code>
/// again with the <code>NextToken</code> request parameter set to the token returned by the
/// previous call.
/// </para>
///
/// <para>
/// For more information, see <a href="https://docs.aws.amazon.com/textract/latest/dg/how-it-works-analyzing.html">Document
/// Text Analysis</a>.
/// </para>
/// </summary>
/// <param name="request">Container for the necessary parameters to execute the GetDocumentAnalysis service method.</param>
/// <param name="cancellationToken">
/// A cancellation token that can be used by other objects or threads to receive notice of cancellation.
/// </param>
///
/// <returns>The response from the GetDocumentAnalysis service method, as returned by Textract.</returns>
/// <exception cref="Amazon.Textract.Model.AccessDeniedException">
/// You aren't authorized to perform the action.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InternalServerErrorException">
/// Amazon Textract experienced a service issue. Try your call again.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InvalidJobIdException">
/// An invalid job identifier was passed to <a>GetDocumentAnalysis</a>.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InvalidParameterException">
/// An input parameter violated a constraint. For example, in synchronous operations, an
/// <code>InvalidParameterException</code> occurs when neither the <code>S3Object</code> nor
/// the <code>Bytes</code> value is supplied in the <code>Document</code> request parameter.
/// Validate your parameter before calling the API operation again.
/// </exception>
/// <exception cref="Amazon.Textract.Model.ProvisionedThroughputExceededException">
/// The number of requests exceeded your throughput limit. If you want to increase this
/// limit, contact Amazon Textract.
/// </exception>
/// <exception cref="Amazon.Textract.Model.ThrottlingException">
/// Amazon Textract is temporarily unable to process the request. Try your call again.
/// </exception>
/// <seealso href="http://docs.aws.amazon.com/goto/WebAPI/textract-2018-06-27/GetDocumentAnalysis">REST API Reference for GetDocumentAnalysis Operation</seealso>
public virtual Task<GetDocumentAnalysisResponse> GetDocumentAnalysisAsync(GetDocumentAnalysisRequest request, System.Threading.CancellationToken cancellationToken = default(CancellationToken))
{
    // Pair the request marshaller with the matching response unmarshaller for this operation.
    var invokeOptions = new InvokeOptions
    {
        RequestMarshaller = GetDocumentAnalysisRequestMarshaller.Instance,
        ResponseUnmarshaller = GetDocumentAnalysisResponseUnmarshaller.Instance
    };

    return InvokeAsync<GetDocumentAnalysisResponse>(request, invokeOptions, cancellationToken);
}
/// <summary>
/// Synchronous counterpart of the GetDocumentAnalysis call: builds the invoke options with
/// the operation's request marshaller and response unmarshaller, then invokes the request.
/// </summary>
/// <param name="request">Container for the parameters of the GetDocumentAnalysis operation.</param>
/// <returns>The response produced by invoking the request.</returns>
internal virtual GetDocumentAnalysisResponse GetDocumentAnalysis(GetDocumentAnalysisRequest request)
{
    var invokeOptions = new InvokeOptions
    {
        RequestMarshaller = GetDocumentAnalysisRequestMarshaller.Instance,
        ResponseUnmarshaller = GetDocumentAnalysisResponseUnmarshaller.Instance
    };

    return Invoke<GetDocumentAnalysisResponse>(request, invokeOptions);
}
/// <summary>
/// Handles a Textract job notification. For a job whose status is SUCCEEDED, every page of
/// the analysis result is fetched and stored in S3 as <c>results/{jobId}/results_{n}.json</c>,
/// followed by a <c>results/{jobId}/manifest</c> object listing the stored keys separated
/// by ';'. Notifications with any other status are logged and otherwise ignored.
/// </summary>
/// <param name="queueEvent">The notification carrying the job id and job status.</param>
/// <returns>A task that completes when the results and manifest have been written.</returns>
/// <remarks>
/// Exceptions raised while fetching or storing results are logged via <c>LogError</c> and
/// not rethrown.
/// </remarks>
public override async Task ProcessMessageAsync(TextractNotification queueEvent)
{
    LogInfo($"notificationMessage {JsonConvert.SerializeObject(queueEvent)}");

    // Only successful jobs have results to collect.
    if (queueEvent.Status != JobStatus.SUCCEEDED)
    {
        return;
    }

    try
    {
        var jobId = queueEvent.JobId;
        string nextToken = null;
        var counter = 0;
        var manifestContents = new List<string>();

        do
        {
            // Fetch the next page of results for the job.
            var getRequest = new GetDocumentAnalysisRequest
            {
                JobId = jobId,
                NextToken = nextToken
            };
            LogInfo(JsonConvert.SerializeObject(getRequest));
            var response = await _textractClient.GetDocumentAnalysisAsync(getRequest);

            // Save this page to S3 and record its key in the manifest.
            var filePath = $"results/{jobId}/results_{counter}.json";
            await PutTextObjectAsync(
                filePath,
                JsonConvert.SerializeObject(response),
                "Unable to save results file to s3");
            manifestContents.Add(filePath);

            // A null token means there are no further pages.
            nextToken = response.NextToken;
            counter++;
        }
        while (nextToken != null);

        var manifestContentsString = string.Join(';', manifestContents);
        LogInfo(manifestContentsString);

        // The manifest is written as a single line terminated by a newline.
        await PutTextObjectAsync(
            $"results/{jobId}/manifest",
            manifestContentsString + Environment.NewLine,
            "Unable to save manifest file to s3");

        // NOTE(review): the purpose of this short pause is not evident from this method;
        // confirm whether it is still needed. Awaited so the thread is not blocked.
        await Task.Delay(100);
    }
    catch (Exception e)
    {
        LogError(e);
    }
}

// Writes the given text to the configured bucket under the given key; throws when S3 does not answer 200 OK.
private async Task PutTextObjectAsync(string key, string content, string failureMessage)
{
    using (var memoryStream = new MemoryStream())
    using (var streamWriter = new StreamWriter(memoryStream))
    {
        streamWriter.Write(content);
        streamWriter.Flush();

        // Rewind so the upload starts at the first byte rather than at the end-of-stream
        // position left behind by the writer.
        memoryStream.Position = 0;

        var s3Response = await _s3Client.PutObjectAsync(new PutObjectRequest
        {
            BucketName = _bucketName,
            Key = key,
            InputStream = memoryStream
        });

        if (s3Response == null || s3Response.HttpStatusCode != HttpStatusCode.OK)
        {
            throw new InvalidOperationException(failureMessage);
        }
    }
}