// Does the work of analyzing PDFs. Start by saving the file to S3, as this is currently the only
// way for Textract to analyze PDFs. Then, point the analysis request to the S3 bucket and begin
// processing. Finally, wait for the results of processing.
//
// file:         the uploaded PDF to analyze.
// featureTypes: the Textract feature types to request; passed through unchanged.
// Returns the response produced by GetDocAnalysisResponse for the started job.
//
// The temporary S3 object is removed even when starting or waiting for the analysis fails.
private async Task<GetDocumentAnalysisResponse> StartPDFAnalysis(IFormFile file, List<string> featureTypes)
{
    // Upload PDF to S3, with a fresh GUID as the object key.
    var objectKey = Guid.NewGuid().ToString();

    // Await instead of blocking with .Wait(): blocking inside an async method ties up a
    // thread and surfaces failures wrapped in AggregateException.
    await PDFtoS3Bucket(file, objectKey);

    try
    {
        // Point the request at the uploaded S3 object. The type is fully qualified
        // exactly as in the original code.
        var request = new StartDocumentAnalysisRequest
        {
            DocumentLocation = new DocumentLocation
            {
                S3Object = new Amazon.Textract.Model.S3Object
                {
                    Bucket = Environment.GetEnvironmentVariable("BUCKET_NAME"),
                    Name = objectKey
                }
            },
            FeatureTypes = featureTypes
        };

        // Start analysis.
        var response = await this.textractClient.StartDocumentAnalysisAsync(request);

        // Wait for analysis to finish.
        return await GetDocAnalysisResponse(response);
    }
    finally
    {
        // Remove PDF from S3. Running this in finally keeps failed analyses from leaving
        // the uploaded document behind in the bucket.
        await RemoveFromS3Bucket(objectKey);
    }
}
/// <summary>
/// Handles a queue event by starting a Textract document analysis for the S3 object
/// named in the event's first record.
/// </summary>
/// <param name="queueEvent">
/// The incoming event. When its <c>Event</c> property is non-null the event is logged
/// as a test event and no analysis is started.
/// </param>
/// <returns>A task that completes once the start request has been sent and logged.</returns>
public override async Task ProcessMessageAsync(QueueEvent queueEvent)
{
    LogInfo($"queueEvent {JsonConvert.SerializeObject(queueEvent)}");

    // Events carrying an Event value are only logged, never analyzed.
    if (queueEvent.Event != null)
    {
        LogInfo($"testEvent {JsonConvert.SerializeObject(queueEvent)}");
        return;
    }

    // Only the first record is considered; both values are null when there is none.
    var firstRecord = queueEvent.Records.FirstOrDefault();
    var sourceBucket = firstRecord?.S3.Bucket.Name;
    var sourceKey = firstRecord?.S3.Object.Key;

    var document = new S3Object();
    document.Bucket = sourceBucket;
    document.Name = sourceKey;

    var completionChannel = new NotificationChannel();
    completionChannel.RoleArn = _roleArn;
    completionChannel.SNSTopicArn = _topicArn;

    var analysisRequest = new StartDocumentAnalysisRequest();
    analysisRequest.DocumentLocation = new DocumentLocation { S3Object = document };
    analysisRequest.NotificationChannel = completionChannel;
    analysisRequest.FeatureTypes = new List<string> { "TABLES" };

    LogInfo(JsonConvert.SerializeObject(analysisRequest));

    var analysisResponse = await _textractClient.StartDocumentAnalysisAsync(analysisRequest);
    LogInfo(JsonConvert.SerializeObject(analysisResponse));
}
/// <summary>
/// Invokes the StartDocumentAnalysis operation through <c>Invoke</c>, using the
/// operation's request marshaller and response unmarshaller singletons.
/// </summary>
/// <param name="request">The StartDocumentAnalysis request to send.</param>
/// <returns>The response returned by <c>Invoke</c> for this request.</returns>
internal virtual StartDocumentAnalysisResponse StartDocumentAnalysis(StartDocumentAnalysisRequest request)
{
    var invokeOptions = new InvokeOptions
    {
        RequestMarshaller = StartDocumentAnalysisRequestMarshaller.Instance,
        ResponseUnmarshaller = StartDocumentAnalysisResponseUnmarshaller.Instance
    };

    return Invoke<StartDocumentAnalysisResponse>(request, invokeOptions);
}
/// <summary>
/// Starts a Textract document analysis for an object in the configured bucket,
/// retrying when the service rejects the request.
/// </summary>
/// <param name="key">Name (key) of the S3 object to analyze.</param>
/// <param name="featureType">The single feature type to request.</param>
/// <param name="maxRetry">
/// Number of additional attempts made after the first one fails. A value of zero or
/// less means no retries.
/// </param>
/// <returns>The job identifier of the started analysis.</returns>
/// <exception cref="InvalidOperationException">
/// The first attempt and every retry failed with an <c>AmazonServiceException</c>;
/// the last such failure is attached as the inner exception.
/// </exception>
public async Task<string> StartDocumentAnalysis(string key, string featureType, int maxRetry)
{
    // Pause between attempts: 30s.
    const int retryDelayMilliseconds = 30000;

    var request = new StartDocumentAnalysisRequest
    {
        DocumentLocation = new DocumentLocation
        {
            S3Object = new S3Object { Bucket = bucketName, Name = key }
        },
        FeatureTypes = new List<string> { featureType }
    };

    AmazonServiceException lastError;

    try
    {
        var response = await textractClient.StartDocumentAnalysisAsync(request);
        return response.JobId;
    }
    catch (AmazonServiceException e) // Open jobs exceed maximum concurrent job limit
    {
        lastError = e;
    }

    for (int attempt = 1; attempt <= maxRetry; attempt++)
    {
        Console.WriteLine("retry -----" + attempt.ToString() + " times");

        // Task.Delay instead of Thread.Sleep: waiting must not block a thread inside
        // an async method.
        await Task.Delay(retryDelayMilliseconds);

        try
        {
            var response = await textractClient.StartDocumentAnalysisAsync(request);
            return response.JobId;
        }
        catch (AmazonServiceException e)
        {
            Console.WriteLine(e.Message);
            lastError = e;
        }
    }

    // Derived from Exception, so existing catch (Exception) handlers still apply;
    // the inner exception preserves the real service error for diagnosis.
    throw new InvalidOperationException(
        "More than 2 jobs using AWS Textract! Retried " + maxRetry.ToString() + " times already.",
        lastError);
}
/// <summary>
/// Starts a Textract document analysis for a single S3 object and returns the job id.
/// </summary>
/// <param name="bucketName">Bucket that holds the document.</param>
/// <param name="key">Name (key) of the document within the bucket.</param>
/// <param name="featureType">The single feature type to request.</param>
/// <returns>The <c>JobId</c> from the start response.</returns>
public async Task<string> StartDocumentAnalysis(string bucketName, string key, string featureType)
{
    var analysisRequest = new StartDocumentAnalysisRequest
    {
        DocumentLocation = new DocumentLocation
        {
            S3Object = new S3Object { Bucket = bucketName, Name = key }
        },
        FeatureTypes = new List<string> { featureType }
    };

    var started = await this.textract.StartDocumentAnalysisAsync(analysisRequest);
    return started.JobId;
}
/// <summary>
/// Starts asynchronous analysis of an input document for relationships between detected
/// items such as key and value pairs, tables, and selection elements.
///
/// <para>
/// <code>StartDocumentAnalysis</code> can analyze text in documents that are in JPG,
/// PNG, and PDF format. The documents are stored in an Amazon S3 bucket. Use <a>DocumentLocation</a>
/// to specify the bucket name and file name of the document.
/// </para>
///
/// <para>
/// <code>StartDocumentAnalysis</code> returns a job identifier (<code>JobId</code>)
/// that you use to get the results of the operation. When text analysis is finished,
/// Amazon Textract publishes a completion status to the Amazon Simple Notification Service
/// (Amazon SNS) topic that you specify in <code>NotificationChannel</code>. To get the
/// results of the text analysis operation, first check that the status value published
/// to the Amazon SNS topic is <code>SUCCEEDED</code>. If so, call <a>GetDocumentAnalysis</a>,
/// and pass the job identifier (<code>JobId</code>) from the initial call to <code>StartDocumentAnalysis</code>.
/// </para>
///
/// <para>
/// For more information, see <a href="https://docs.aws.amazon.com/textract/latest/dg/how-it-works-analyzing.html">Document
/// Text Analysis</a>.
/// </para>
/// </summary>
/// <param name="request">Container for the necessary parameters to execute the StartDocumentAnalysis service method.</param>
/// <param name="cancellationToken">
/// A cancellation token that can be used by other objects or threads to receive notice of cancellation.
/// </param>
///
/// <returns>The response from the StartDocumentAnalysis service method, as returned by Textract.</returns>
/// <exception cref="Amazon.Textract.Model.AccessDeniedException">
/// You aren't authorized to perform the action.
/// </exception>
/// <exception cref="Amazon.Textract.Model.BadDocumentException">
/// Amazon Textract isn't able to read the document.
/// </exception>
/// <exception cref="Amazon.Textract.Model.DocumentTooLargeException">
/// The document can't be processed because it's too large. The maximum document size
/// for synchronous operations is 5 MB. The maximum document size for asynchronous operations
/// is 500 MB for PDF format files.
/// </exception>
/// <exception cref="Amazon.Textract.Model.IdempotentParameterMismatchException">
/// A <code>ClientRequestToken</code> input parameter was reused with an operation, but
/// at least one of the other input parameters is different from the previous call to
/// the operation.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InternalServerErrorException">
/// Amazon Textract experienced a service issue. Try your call again.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InvalidParameterException">
/// An input parameter violated a constraint. For example, in synchronous operations,
/// an <code>InvalidParameterException</code> exception occurs when neither of the <code>S3Object</code>
/// or <code>Bytes</code> values are supplied in the <code>Document</code> request parameter.
/// Validate your parameter before calling the API operation again.
/// </exception>
/// <exception cref="Amazon.Textract.Model.InvalidS3ObjectException">
/// Amazon Textract is unable to access the S3 object that's specified in the request.
/// </exception>
/// <exception cref="Amazon.Textract.Model.LimitExceededException">
/// An Amazon Textract service limit was exceeded. For example, if you start too many
/// asynchronous jobs concurrently, calls to start operations (<code>StartDocumentTextDetection</code>,
/// for example) raise a LimitExceededException exception (HTTP status code: 400) until
/// the number of concurrently running jobs is below the Amazon Textract service limit.
/// </exception>
/// <exception cref="Amazon.Textract.Model.ProvisionedThroughputExceededException">
/// The number of requests exceeded your throughput limit. If you want to increase this
/// limit, contact Amazon Textract.
/// </exception>
/// <exception cref="Amazon.Textract.Model.ThrottlingException">
/// Amazon Textract is temporarily unable to process the request. Try your call again.
/// </exception>
/// <exception cref="Amazon.Textract.Model.UnsupportedDocumentException">
/// The format of the input document isn't supported. Amazon Textract supports documents
/// that are .png or .jpg format.
/// </exception>
/// <seealso href="http://docs.aws.amazon.com/goto/WebAPI/textract-2018-06-27/StartDocumentAnalysis">REST API Reference for StartDocumentAnalysis Operation</seealso>
public virtual Task<StartDocumentAnalysisResponse> StartDocumentAnalysisAsync(StartDocumentAnalysisRequest request, System.Threading.CancellationToken cancellationToken = default(CancellationToken))
{
    // Wire up the operation-specific (un)marshallers, then hand off to InvokeAsync.
    var invokeOptions = new InvokeOptions
    {
        RequestMarshaller = StartDocumentAnalysisRequestMarshaller.Instance,
        ResponseUnmarshaller = StartDocumentAnalysisResponseUnmarshaller.Instance
    };

    return InvokeAsync<StartDocumentAnalysisResponse>(request, invokeOptions, cancellationToken);
}