/// <summary>
/// Uploads a single document (file id, file name, OCR text and key phrases) to the search index.
/// Per-document indexing failures are logged to the console and do not propagate.
/// </summary>
/// <param name="indexClient">Client used to send the index batch.</param>
/// <param name="fileId">Value stored in the "fileId" field.</param>
/// <param name="fileName">Value stored in the "fileName" field.</param>
/// <param name="ocrText">Value stored in the "ocrText" field.</param>
/// <param name="keyPhraseResult">
/// Source of the "keyPhrases" field. A null result, or a result whose KeyPhrases is null,
/// is stored as an empty list.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="indexClient"/> is null.</exception>
public static void UploadDocuments(SearchIndexClient indexClient, string fileId, string fileName, string ocrText, KeyPhraseResult keyPhraseResult)
{
    if (indexClient == null)
    {
        throw new ArgumentNullException("indexClient");
    }

    var doc = new Document();
    doc.Add("fileId", fileId);
    doc.Add("fileName", fileName);
    doc.Add("ocrText", ocrText);

    if (keyPhraseResult != null && keyPhraseResult.KeyPhrases != null)
    {
        doc.Add("keyPhrases", keyPhraseResult.KeyPhrases.ToList());
    }
    else
    {
        // No key phrases available: index an empty collection instead of failing with a NullReferenceException.
        doc.Add("keyPhrases", new List<string>());
    }

    var indexOperations = new List<IndexAction> { IndexAction.Upload(doc) };

    try
    {
        indexClient.Documents.Index(new IndexBatch(indexOperations));
    }
    catch (IndexBatchException e)
    {
        // Sometimes when your Search service is under load, indexing will fail for some of the documents in
        // the batch. Depending on your application, you can take compensating actions like delaying and
        // retrying. For this simple demo, we just log the failed document keys and continue.
        Console.WriteLine(
            "Failed to index some of the documents: {0}",
            String.Join(", ", e.IndexingResults.Where(r => !r.Succeeded).Select(r => r.Key)));
    }
}
/// <summary>
/// Console entry point. Creates the search index, then for every PDF in the "pdf" folder:
/// extracts its images into the "image" folder, runs OCR over each image file found there,
/// extracts key phrases from the recognized text and uploads the result to the index.
/// Finishes by running a sample search and waiting for the user to press Enter.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    var searchPath = "pdf";
    var outPath = "image";

    // Note, this will create a new Azure Search Index for the OCR text
    Console.WriteLine("Creating Azure Search index...");
    AzureSearch.CreateIndex(serviceClient, indexName);

    // Creating an image directory
    if (!Directory.Exists(outPath))
    {
        Directory.CreateDirectory(outPath);
    }

    foreach (var filename in Directory.GetFiles(searchPath, "*.pdf", SearchOption.TopDirectoryOnly))
    {
        string fileNameOnly = System.IO.Path.GetFileName(filename);

        Console.WriteLine("Extracting images from {0} \r\n", fileNameOnly);
        var images = PdfImageExtractor.ExtractImages(filename);
        Console.WriteLine("{0} images found.", images.Count);
        Console.WriteLine();

        foreach (var name in images.Keys)
        {
            // Skip names that are empty or end with a trailing '.'
            if (name.LastIndexOf(".") + 1 != name.Length)
            {
                images[name].Save(Path.Combine(outPath, name));
            }
        }

        // NOTE(review): every file currently in the image folder is OCR'd and then deleted, including
        // any left over from an earlier interrupted run - confirm that is acceptable.
        var ocrBuilder = new System.Text.StringBuilder();
        Console.WriteLine("Extracting text from image... \r\n");
        foreach (var imagefilename in Directory.GetFiles(outPath))
        {
            OcrResults ocr = vision.RecognizeText(imagefilename);
            ocrBuilder.Append(vision.GetRetrieveText(ocr));
            File.Delete(imagefilename);
        }

        string ocrText = ocrBuilder.ToString();

        // Only call the key phrase service and upload when OCR actually produced text;
        // there is nothing to analyze or index otherwise.
        if (ocrText.Length > 0)
        {
            Console.WriteLine("Extracting key phrases from processed text... \r\n");
            KeyPhraseResult keyPhraseResult = TextExtraction.ProcessText(ocrText);

            // Take the resulting ocrText and upload to a new Azure Search Index
            // It is highly recommended that you upload documents in batches rather than
            // individually like is done here
            Console.WriteLine("Uploading extracted text to Azure Search...\r\n");

            // Standard Base64 can emit '+' and '/', which are not allowed in Azure Search document keys;
            // map them to the URL-safe '-' and '_' so every file name yields a valid key.
            string fileId = System.Convert.ToBase64String(System.Text.Encoding.UTF8.GetBytes(fileNameOnly))
                .Replace('+', '-')
                .Replace('/', '_');

            AzureSearch.UploadDocuments(indexClient, fileId, fileNameOnly, ocrText, keyPhraseResult);
        }
    }

    // Execute a test search
    Console.WriteLine("Execute Search...");
    AzureSearch.SearchDocuments(indexClient, "Azure Search");

    Console.WriteLine("All done. Press any key to continue.");
    Console.ReadLine();
}
/// <summary>
/// Builds one search document from the given file data and uploads it to the index as a single-item batch.
/// Documents that fail to index are reported on the console; the exception is not rethrown.
/// </summary>
/// <param name="indexClient">Client used to send the index batch.</param>
/// <param name="fileId">Value stored in the "fileId" field.</param>
/// <param name="fileName">Value stored in the "fileName" field.</param>
/// <param name="ocrText">Value stored in the "ocrText" field.</param>
/// <param name="keyPhraseResult">
/// Source of the "keyPhrases" field. A null result, or a result whose KeyPhrases is null,
/// is stored as an empty list.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="indexClient"/> is null.</exception>
public static void UploadDocuments(SearchIndexClient indexClient, string fileId, string fileName, string ocrText, KeyPhraseResult keyPhraseResult)
{
    if (indexClient == null)
    {
        throw new ArgumentNullException("indexClient");
    }

    bool hasKeyPhrases = keyPhraseResult != null && keyPhraseResult.KeyPhrases != null;

    var doc = new Document();
    doc.Add("fileId", fileId);
    doc.Add("fileName", fileName);
    doc.Add("ocrText", ocrText);
    if (hasKeyPhrases)
    {
        doc.Add("keyPhrases", keyPhraseResult.KeyPhrases.ToList());
    }
    else
    {
        // Missing key phrases are indexed as an empty collection rather than causing a NullReferenceException.
        doc.Add("keyPhrases", new List<string>());
    }

    var indexOperations = new List<IndexAction> { IndexAction.Upload(doc) };

    try
    {
        indexClient.Documents.Index(new IndexBatch(indexOperations));
    }
    catch (IndexBatchException e)
    {
        // Sometimes when your Search service is under load, indexing will fail for some of the documents in
        // the batch. Depending on your application, you can take compensating actions like delaying and
        // retrying. For this simple demo, we just log the failed document keys and continue.
        Console.WriteLine(
            "Failed to index some of the documents: {0}",
            String.Join(", ", e.IndexingResults.Where(r => !r.Succeeded).Select(r => r.Key)));
    }
}
/// <summary>
/// Sends <paramref name="inputText"/> to the text analytics GetKeyPhrases endpoint, prints the
/// returned key phrases to the console and returns the deserialized result.
/// </summary>
/// <param name="inputText">Text to analyze. It is URL-encoded and passed in the request's query string.</param>
/// <returns>
/// The result parsed from the JSON response. A default <see cref="KeyPhraseResult"/> is returned when
/// <paramref name="inputText"/> is null or empty, or when the response body deserializes to null.
/// </returns>
/// <exception cref="HttpRequestException">The service answered with a non-success status code.</exception>
public static KeyPhraseResult ProcessText(string inputText)
{
    KeyPhraseResult keyPhraseResult = new KeyPhraseResult();

    // Nothing to analyze: skip the service round trip.
    if (string.IsNullOrEmpty(inputText))
    {
        return keyPhraseResult;
    }

    string accountKey = ConfigurationManager.AppSettings["textExtractionAccountKey"];

    using (var httpClient = new HttpClient())
    {
        string inputTextEncoded = HttpUtility.UrlEncode(inputText);

        httpClient.BaseAddress = new Uri(ServiceBaseUri);
        string creds = "AccountKey:" + accountKey;
        string authorizationHeader = "Basic " + Convert.ToBase64String(Encoding.ASCII.GetBytes(creds));
        httpClient.DefaultRequestHeaders.Add("Authorization", authorizationHeader);
        httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));

        // get key phrases
        // NOTE(review): the whole text travels in the query string, so very long inputs may exceed
        // URL length limits - confirm the expected input sizes.
        string keyPhrasesRequest = "data.ashx/amla/text-analytics/v1/GetKeyPhrases?Text=" + inputTextEncoded;

        // This method is synchronous, so it blocks on the async calls via Result.
        // The response is disposed as soon as its content has been read.
        using (HttpResponseMessage response = httpClient.GetAsync(keyPhrasesRequest).Result)
        {
            string content = response.Content.ReadAsStringAsync().Result;
            if (!response.IsSuccessStatusCode)
            {
                throw new HttpRequestException("Call to get key phrases failed with HTTP status code: " + response.StatusCode + " and contents: " + content);
            }

            // DeserializeObject can yield null (e.g. for an empty or "null" body); keep the default instance then.
            KeyPhraseResult parsed = JsonConvert.DeserializeObject<KeyPhraseResult>(content);
            if (parsed != null)
            {
                keyPhraseResult = parsed;
            }

            string joinedPhrases = keyPhraseResult.KeyPhrases == null
                ? string.Empty
                : string.Join(",", keyPhraseResult.KeyPhrases);
            Console.WriteLine("Key phrases: {0} \r\n", joinedPhrases);
        }

        // Uncomment the following if you want to retrieve additional details on this text

        //// get sentiment
        //string sentimentRequest = "data.ashx/amla/text-analytics/v1/GetSentiment?Text=" + inputTextEncoded;
        //using (HttpResponseMessage sentimentResponse = httpClient.GetAsync(sentimentRequest).Result)
        //{
        //    string sentimentContent = sentimentResponse.Content.ReadAsStringAsync().Result;
        //    if (!sentimentResponse.IsSuccessStatusCode)
        //    {
        //        throw new HttpRequestException("Call to get sentiment failed with HTTP status code: " +
        //                                       sentimentResponse.StatusCode + " and contents: " + sentimentContent);
        //    }
        //    SentimentResult sentimentResult = JsonConvert.DeserializeObject<SentimentResult>(sentimentContent);
        //    Console.WriteLine("Sentiment score: " + sentimentResult.Score);
        //}

        //// get the language in text
        //string languageRequest = "data.ashx/amla/text-analytics/v1/GetLanguage?Text=" + inputTextEncoded;
        //using (HttpResponseMessage languageResponse = httpClient.GetAsync(languageRequest).Result)
        //{
        //    string languageContent = languageResponse.Content.ReadAsStringAsync().Result;
        //    if (!languageResponse.IsSuccessStatusCode)
        //    {
        //        throw new HttpRequestException("Call to get language failed with HTTP status code: " +
        //                                       languageResponse.StatusCode + " and contents: " + languageContent);
        //    }
        //    LanguageResult languageResult = JsonConvert.DeserializeObject<LanguageResult>(languageContent);
        //    Console.WriteLine("Detected Languages: " + string.Join(",", languageResult.DetectedLanguages.Select(language => language.Name).ToArray()));
        //}
    }

    return keyPhraseResult;
}
/// <summary>
/// Uploads one document with the fields "fileId", "fileName", "ocrText" and "keyPhrases" to the search index.
/// If the batch reports failed documents, their keys are written to the console and execution continues.
/// </summary>
/// <param name="indexClient">Client used to send the index batch.</param>
/// <param name="fileId">Value stored in the "fileId" field.</param>
/// <param name="fileName">Value stored in the "fileName" field.</param>
/// <param name="ocrText">Value stored in the "ocrText" field.</param>
/// <param name="keyPhraseResult">
/// Source of the "keyPhrases" field. A null result, or a result whose KeyPhrases is null,
/// is stored as an empty list.
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="indexClient"/> is null.</exception>
public static void UploadDocuments(SearchIndexClient indexClient, string fileId, string fileName, string ocrText, KeyPhraseResult keyPhraseResult)
{
    if (indexClient == null)
    {
        throw new ArgumentNullException("indexClient");
    }

    var doc = new Document();
    doc.Add("fileId", fileId);
    doc.Add("fileName", fileName);
    doc.Add("ocrText", ocrText);

    if (keyPhraseResult == null || keyPhraseResult.KeyPhrases == null)
    {
        // Guard against a missing result: store an empty collection instead of throwing a NullReferenceException.
        doc.Add("keyPhrases", new List<string>());
    }
    else
    {
        doc.Add("keyPhrases", keyPhraseResult.KeyPhrases.ToList());
    }

    var indexOperations = new List<IndexAction> { IndexAction.Upload(doc) };

    try
    {
        indexClient.Documents.Index(new IndexBatch(indexOperations));
    }
    catch (IndexBatchException e)
    {
        // Best effort: report the keys of the documents that failed and carry on.
        Console.WriteLine(
            "Failed to index some of the documents: {0}",
            String.Join(", ", e.IndexingResults.Where(r => !r.Succeeded).Select(r => r.Key)));
    }
}
/// <summary>
/// Calls the text analytics GetKeyPhrases endpoint for <paramref name="inputText"/>, writes the
/// key phrases to the console and returns the deserialized response.
/// </summary>
/// <param name="inputText">Text to analyze. It is URL-encoded and passed in the request's query string.</param>
/// <returns>
/// The result parsed from the JSON response. A default <see cref="KeyPhraseResult"/> is returned when
/// <paramref name="inputText"/> is null or empty, or when the response body deserializes to null.
/// </returns>
/// <exception cref="HttpRequestException">The service answered with a non-success status code.</exception>
public static KeyPhraseResult ProcessText(string inputText)
{
    KeyPhraseResult keyPhraseResult = new KeyPhraseResult();

    // Empty input has nothing to extract, so avoid calling the service at all.
    if (string.IsNullOrEmpty(inputText))
    {
        return keyPhraseResult;
    }

    string accountKey = ConfigurationManager.AppSettings["textExtractionAccountKey"];

    using (var httpClient = new HttpClient())
    {
        string inputTextEncoded = HttpUtility.UrlEncode(inputText);

        httpClient.BaseAddress = new Uri(ServiceBaseUri);
        string creds = "AccountKey:" + accountKey;
        string authorizationHeader = "Basic " + Convert.ToBase64String(Encoding.ASCII.GetBytes(creds));
        httpClient.DefaultRequestHeaders.Add("Authorization", authorizationHeader);
        httpClient.DefaultRequestHeaders.Accept.Add(new MediaTypeWithQualityHeaderValue("application/json"));

        // get key phrases
        // NOTE(review): the text is sent as a query-string parameter, so very long inputs may exceed
        // URL length limits - confirm the expected input sizes.
        string keyPhrasesRequest = "data.ashx/amla/text-analytics/v1/GetKeyPhrases?Text=" + inputTextEncoded;

        // Result blocks until the call completes (the method's signature is synchronous).
        // Wrapping the response in using releases it once the body has been read.
        using (HttpResponseMessage response = httpClient.GetAsync(keyPhrasesRequest).Result)
        {
            string content = response.Content.ReadAsStringAsync().Result;
            if (!response.IsSuccessStatusCode)
            {
                throw new HttpRequestException("Call to get key phrases failed with HTTP status code: " + response.StatusCode + " and contents: " + content);
            }

            // A null deserialization result (empty or "null" body) falls back to the default instance.
            KeyPhraseResult parsed = JsonConvert.DeserializeObject<KeyPhraseResult>(content);
            if (parsed != null)
            {
                keyPhraseResult = parsed;
            }

            string joinedPhrases = keyPhraseResult.KeyPhrases == null
                ? string.Empty
                : string.Join(",", keyPhraseResult.KeyPhrases);
            Console.WriteLine("Key phrases: {0} \r\n", joinedPhrases);
        }

        // Uncomment the following if you want to retrieve additional details on this text

        //// get sentiment
        //string sentimentRequest = "data.ashx/amla/text-analytics/v1/GetSentiment?Text=" + inputTextEncoded;
        //using (HttpResponseMessage sentimentResponse = httpClient.GetAsync(sentimentRequest).Result)
        //{
        //    string sentimentContent = sentimentResponse.Content.ReadAsStringAsync().Result;
        //    if (!sentimentResponse.IsSuccessStatusCode)
        //    {
        //        throw new HttpRequestException("Call to get sentiment failed with HTTP status code: " +
        //                                       sentimentResponse.StatusCode + " and contents: " + sentimentContent);
        //    }
        //    SentimentResult sentimentResult = JsonConvert.DeserializeObject<SentimentResult>(sentimentContent);
        //    Console.WriteLine("Sentiment score: " + sentimentResult.Score);
        //}

        //// get the language in text
        //string languageRequest = "data.ashx/amla/text-analytics/v1/GetLanguage?Text=" + inputTextEncoded;
        //using (HttpResponseMessage languageResponse = httpClient.GetAsync(languageRequest).Result)
        //{
        //    string languageContent = languageResponse.Content.ReadAsStringAsync().Result;
        //    if (!languageResponse.IsSuccessStatusCode)
        //    {
        //        throw new HttpRequestException("Call to get language failed with HTTP status code: " +
        //                                       languageResponse.StatusCode + " and contents: " + languageContent);
        //    }
        //    LanguageResult languageResult = JsonConvert.DeserializeObject<LanguageResult>(languageContent);
        //    Console.WriteLine("Detected Languages: " + string.Join(",", languageResult.DetectedLanguages.Select(language => language.Name).ToArray()));
        //}
    }

    return keyPhraseResult;
}