示例#1
0
        /// <summary>
        /// Does the work of analyzing PDFs. The file is first saved to S3 (noted by the original
        /// author as currently the only way for Textract to analyze PDFs), the analysis request is
        /// pointed at that S3 object, and the results of processing are awaited. The uploaded PDF
        /// is removed from S3 afterwards, whether or not the analysis succeeded.
        /// </summary>
        /// <param name="file">The uploaded PDF to analyze.</param>
        /// <param name="featureTypes">Feature types passed through to the Textract analysis request.</param>
        /// <returns>The analysis results produced by <c>GetDocAnalysisResponse</c> for the started job.</returns>
        private async Task <GetDocumentAnalysisResponse> StartPDFAnalysis(IFormFile file, List <string> featureTypes)
        {
            // Upload PDF to S3, with a fresh guid as the file key.
            var objectKey = Guid.NewGuid().ToString();

            // Awaited rather than blocked on with .Wait(): blocking inside an async method ties up
            // a thread and can deadlock when a synchronization context is present.
            await PDFtoS3Bucket(file, objectKey);

            try
            {
                // Point the analysis request at the S3 object that was just uploaded.
                var request = new StartDocumentAnalysisRequest
                {
                    DocumentLocation = new DocumentLocation
                    {
                        S3Object = new Amazon.Textract.Model.S3Object
                        {
                            Bucket = Environment.GetEnvironmentVariable("BUCKET_NAME"),
                            Name   = objectKey
                        }
                    },
                    FeatureTypes = featureTypes
                };

                // Start analysis, then wait for it to finish.
                var response = await this.textractClient.StartDocumentAnalysisAsync(request);

                return await GetDocAnalysisResponse(response);
            }
            finally
            {
                // Remove the PDF from S3 even when starting or awaiting the analysis failed, so
                // failed requests do not leave uploaded documents behind in the bucket.
                await RemoveFromS3Bucket(objectKey);
            }
        }
示例#2
0
        /// <summary>
        /// Logs the incoming queue event and, unless the event carries an <c>Event</c> value,
        /// starts a Textract "TABLES" document analysis for the S3 bucket and object key of the
        /// first record. Completion is reported through the configured notification channel.
        /// </summary>
        /// <param name="queueEvent">The queue event to process.</param>
        public override async Task ProcessMessageAsync(QueueEvent queueEvent)
        {
            LogInfo($"queueEvent {JsonConvert.SerializeObject(queueEvent)}");

            // Events that carry an Event value are only logged; no analysis is started for them.
            if (queueEvent.Event != null)
            {
                LogInfo($"testEvent {JsonConvert.SerializeObject(queueEvent)}");
                return;
            }

            // Only the first record is considered.
            // NOTE(review): when there are no records, bucket and key are both null and the
            // request is still sent — confirm that is intended.
            var firstRecord = queueEvent.Records.FirstOrDefault();

            var documentLocation = new DocumentLocation
            {
                S3Object = new S3Object
                {
                    Bucket = firstRecord?.S3.Bucket.Name,
                    Name   = firstRecord?.S3.Object.Key
                }
            };

            var notificationChannel = new NotificationChannel
            {
                RoleArn     = _roleArn,
                SNSTopicArn = _topicArn
            };

            var startRequest = new StartDocumentAnalysisRequest
            {
                DocumentLocation    = documentLocation,
                NotificationChannel = notificationChannel,
                FeatureTypes        = new List <string> { "TABLES" }
            };

            // Log both the outgoing request and the service response.
            LogInfo(JsonConvert.SerializeObject(startRequest));

            var startResponse = await _textractClient.StartDocumentAnalysisAsync(startRequest);

            LogInfo(JsonConvert.SerializeObject(startResponse));
        }
        /// <summary>
        /// Synchronously invokes the StartDocumentAnalysis operation using the operation's
        /// request marshaller and response unmarshaller.
        /// </summary>
        /// <param name="request">The request to send.</param>
        /// <returns>The response produced by the invocation.</returns>
        internal virtual StartDocumentAnalysisResponse StartDocumentAnalysis(StartDocumentAnalysisRequest request)
        {
            var invokeOptions = new InvokeOptions
            {
                RequestMarshaller    = StartDocumentAnalysisRequestMarshaller.Instance,
                ResponseUnmarshaller = StartDocumentAnalysisResponseUnmarshaller.Instance
            };

            return Invoke <StartDocumentAnalysisResponse>(request, invokeOptions);
        }
        /// <summary>
        /// Starts a Textract document analysis for the object <paramref name="key"/> in the
        /// configured bucket and returns the job id. If the service call fails with an
        /// <c>AmazonServiceException</c> (the original author's note: open jobs exceed the maximum
        /// concurrent job limit), the call is retried up to <paramref name="maxRetry"/> times,
        /// waiting 30 seconds before each retry.
        /// </summary>
        /// <param name="key">Name of the S3 object to analyze.</param>
        /// <param name="featureType">The single feature type to request.</param>
        /// <param name="maxRetry">Maximum number of retries after the initial attempt; zero or less means no retries.</param>
        /// <returns>The job id of the started analysis.</returns>
        /// <exception cref="Exception">
        /// Thrown when the initial attempt and all retries failed; the last service error is
        /// attached as the inner exception.
        /// </exception>
        public async Task <string> StartDocumentAnalysis(string key, string featureType, int maxRetry)
        {
            const int retryDelayMilliseconds = 30000; // 30s

            var request = new StartDocumentAnalysisRequest
            {
                DocumentLocation = new DocumentLocation
                {
                    S3Object = new S3Object
                    {
                        Bucket = bucketName,
                        Name   = key
                    }
                },
                FeatureTypes = new List <string> {
                    featureType
                }
            };

            int retryTime = 0;

            while (true)
            {
                try
                {
                    var response = await textractClient.StartDocumentAnalysisAsync(request);

                    return(response.JobId);
                }
                // NOTE(review): every AmazonServiceException is retried, not only limit-exceeded
                // errors — confirm whether non-transient failures should fail fast instead.
                catch (AmazonServiceException e)
                {
                    // Report every failure, including the first one, instead of swallowing it.
                    Console.WriteLine(e.Message);

                    if (retryTime >= maxRetry)
                    {
                        // Keep the underlying service error reachable for callers and logs.
                        throw new Exception("More than 2 jobs using AWS Textract! Retried " + maxRetry.ToString() + " times already.", e);
                    }
                }

                retryTime++;
                Console.WriteLine("retry -----" + retryTime.ToString() + " times");

                // Asynchronous wait: Thread.Sleep would block the calling thread for the whole delay.
                await Task.Delay(retryDelayMilliseconds);
            }
        }
示例#5
0
        /// <summary>
        /// Starts a Textract document analysis for a single S3 object and returns the job id.
        /// </summary>
        /// <param name="bucketName">Name of the S3 bucket holding the document.</param>
        /// <param name="key">Name of the S3 object to analyze.</param>
        /// <param name="featureType">The single feature type to request.</param>
        /// <returns>The job id of the started analysis.</returns>
        public async Task <string> StartDocumentAnalysis(string bucketName, string key, string featureType)
        {
            var analysisRequest = new StartDocumentAnalysisRequest
            {
                DocumentLocation = new DocumentLocation
                {
                    S3Object = new S3Object
                    {
                        Bucket = bucketName,
                        Name   = key
                    }
                },
                FeatureTypes = new List <string> { featureType }
            };

            var startResult = await this.textract.StartDocumentAnalysisAsync(analysisRequest);

            return startResult.JobId;
        }
        /// <summary>
        /// Starts asynchronous analysis of an input document for relationships between detected
        /// items such as key and value pairs, tables, and selection elements.
        ///
        ///
        /// <para>
        ///  <code>StartDocumentAnalysis</code> can analyze text in documents that are in JPG,
        /// PNG, and PDF format. The documents are stored in an Amazon S3 bucket. Use <a>DocumentLocation</a>
        /// to specify the bucket name and file name of the document.
        /// </para>
        ///
        /// <para>
        ///  <code>StartDocumentAnalysis</code> returns a job identifier (<code>JobId</code>)
        /// that you use to get the results of the operation. When text analysis is finished,
        /// Amazon Textract publishes a completion status to the Amazon Simple Notification Service
        /// (Amazon SNS) topic that you specify in <code>NotificationChannel</code>. To get the
        /// results of the text analysis operation, first check that the status value published
        /// to the Amazon SNS topic is <code>SUCCEEDED</code>. If so, call <a>GetDocumentAnalysis</a>,
        /// and pass the job identifier (<code>JobId</code>) from the initial call to <code>StartDocumentAnalysis</code>.
        /// </para>
        ///
        /// <para>
        /// For more information, see <a href="https://docs.aws.amazon.com/textract/latest/dg/how-it-works-analyzing.html">Document
        /// Text Analysis</a>.
        /// </para>
        /// </summary>
        /// <param name="request">Container for the necessary parameters to execute the StartDocumentAnalysis service method.</param>
        /// <param name="cancellationToken">
        ///     A cancellation token that can be used by other objects or threads to receive notice of cancellation.
        /// </param>
        ///
        /// <returns>The response from the StartDocumentAnalysis service method, as returned by Textract.</returns>
        /// <exception cref="Amazon.Textract.Model.AccessDeniedException">
        /// You aren't authorized to perform the action.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.BadDocumentException">
        /// Amazon Textract isn't able to read the document.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.DocumentTooLargeException">
        /// The document can't be processed because it's too large. The maximum document size
        /// for synchronous operations is 5 MB. The maximum document size for asynchronous operations
        /// is 500 MB for PDF format files.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.IdempotentParameterMismatchException">
        /// A <code>ClientRequestToken</code> input parameter was reused with an operation, but
        /// at least one of the other input parameters is different from the previous call to
        /// the operation.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InternalServerErrorException">
        /// Amazon Textract experienced a service issue. Try your call again.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InvalidParameterException">
        /// An input parameter violated a constraint. For example, in synchronous operations,
        /// an <code>InvalidParameterException</code> exception occurs when neither of the <code>S3Object</code>
        /// or <code>Bytes</code> values are supplied in the <code>Document</code> request parameter.
        /// Validate your parameter before calling the API operation again.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.InvalidS3ObjectException">
        /// Amazon Textract is unable to access the S3 object that's specified in the request.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.LimitExceededException">
        /// An Amazon Textract service limit was exceeded. For example, if you start too many
        /// asynchronous jobs concurrently, calls to start operations (<code>StartDocumentTextDetection</code>,
        /// for example) raise a LimitExceededException exception (HTTP status code: 400) until
        /// the number of concurrently running jobs is below the Amazon Textract service limit.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.ProvisionedThroughputExceededException">
        /// The number of requests exceeded your throughput limit. If you want to increase this
        /// limit, contact Amazon Textract.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.ThrottlingException">
        /// Amazon Textract is temporarily unable to process the request. Try your call again.
        /// </exception>
        /// <exception cref="Amazon.Textract.Model.UnsupportedDocumentException">
        /// The format of the input document isn't supported. Amazon Textract supports documents
        /// that are .png or .jpg format.
        /// </exception>
        /// <seealso href="http://docs.aws.amazon.com/goto/WebAPI/textract-2018-06-27/StartDocumentAnalysis">REST API Reference for StartDocumentAnalysis Operation</seealso>
        public virtual Task <StartDocumentAnalysisResponse> StartDocumentAnalysisAsync(StartDocumentAnalysisRequest request, System.Threading.CancellationToken cancellationToken = default(CancellationToken))
        {
            // Pair the operation's request marshaller with its response unmarshaller, then hand
            // the request and cancellation token to the asynchronous invoke pipeline.
            var invokeOptions = new InvokeOptions
            {
                RequestMarshaller    = StartDocumentAnalysisRequestMarshaller.Instance,
                ResponseUnmarshaller = StartDocumentAnalysisResponseUnmarshaller.Instance
            };

            return InvokeAsync <StartDocumentAnalysisResponse>(request, invokeOptions, cancellationToken);
        }