/// <summary>
/// Serializes <paramref name="source"/> as indented JSON into the file at
/// <paramref name="pathToSaveTo"/>, creating the containing directory first when the
/// path names one.
/// </summary>
/// <param name="source">The analysis record to serialize.</param>
/// <param name="pathToSaveTo">Destination file path; an existing file is overwritten.</param>
/// <exception cref="ArgumentNullException"><paramref name="pathToSaveTo"/> is null.</exception>
/// <remarks>
/// Writing is best-effort: a failure while creating or serializing the file is reported
/// on standard error and is not rethrown. Failures while creating the directory still
/// propagate to the caller.
/// </remarks>
public static void SaveRecordJsonFile(this TextAnalysisRecord source, string pathToSaveTo)
{
    if (pathToSaveTo == null)
    {
        throw new ArgumentNullException(nameof(pathToSaveTo));
    }

    // GetDirectoryName yields null/empty for a bare file name or a root path;
    // CreateDirectory rejects both, so only create a directory when there is one to create.
    var directory = Path.GetDirectoryName(pathToSaveTo);
    if (!string.IsNullOrEmpty(directory))
    {
        Directory.CreateDirectory(directory);
    }

    try
    {
        using (StreamWriter file = File.CreateText(pathToSaveTo))
        {
            var serializer = new JsonSerializer { Formatting = Formatting.Indented };
            serializer.Serialize(file, source);
        }
    }
    catch (Exception e)
    {
        // Keep the save best-effort, but never lose the failure silently.
        Console.Error.WriteLine($"Failed to save analysis record to '{pathToSaveTo}': {e}");
    }
}
/// <summary>
/// Runs key-phrase extraction over every <c>*.txt</c> file found under
/// <paramref name="folderPath"/> (searched recursively) and saves one JSON record per
/// file beneath <paramref name="outputPath"/>.
/// </summary>
/// <param name="folderPath">Root folder that is searched for text files.</param>
/// <param name="outputPath">
/// Root folder for the results; each record is written to
/// <c>{name of the file's parent folder}/{file name without extension}.json</c>.
/// </param>
/// <remarks>
/// Files are sent to the analytics service in batches of 20 files. Each file is split
/// into chunks by <c>GetTextChunks</c> and every chunk becomes one input document.
/// </remarks>
private static void AnalyzeFolderDocuments(string folderPath, string outputPath)
{
    const int batchSize = 20;

    var files = Directory.EnumerateFiles(folderPath, "*.txt", SearchOption.AllDirectories);

    // Break down file list into batches of batchSize
    var filesBatched = files
        .Select((id, index) => new { id, index })
        .GroupBy(x => x.index / batchSize)
        .Select(g => g.Select(x => x.id));

    foreach (var batch in filesBatched)
    {
        // Input we will send to Microsoft's text analytics API as a List
        var batchInput = new List<MultiLanguageInput>();
        var batchRequest = new List<TextAnalyticsRequest>();
        // Chunk ids submitted for each request, index-aligned with batchRequest.
        var batchChunkIds = new List<HashSet<string>>();

        foreach (var file in batch)
        {
            // Record initial file information for each file to be analyzed
            var request = new TextAnalyticsRequest()
            {
                FileName = Path.GetFileNameWithoutExtension(file),
                FolderName = Path.GetFileName(Path.GetDirectoryName(file)),
                FileText = File.ReadAllText(file)
            };

            // Break down file text into chunks (the original note says 5000 chars, because
            // Microsoft's API only accepts inputs of that length; the chunking itself
            // lives in GetTextChunks).
            var fileChunks = GetTextChunks(request.FileText);

            // Prefix every chunk id with the file's position in the batch. The file name
            // alone is not unique (the search is recursive, so two folders may hold the
            // same name) and document ids must be unique within a request.
            var filePosition = batchRequest.Count;
            var chunkIds = new HashSet<string>();
            var chunkNum = 0;
            foreach (var chunkText in fileChunks)
            {
                var chunkId = $"{filePosition}_{request.FileName}_{++chunkNum}";
                chunkIds.Add(chunkId);
                batchInput.Add(new MultiLanguageInput("en", chunkId, chunkText));
            }

            request.NumberOfChunks = chunkNum;
            batchRequest.Add(request);
            batchChunkIds.Add(chunkIds);
        }

        if (batchRequest.Count == 0)
        {
            continue;
        }

        ////var entityBatchResults = s_analyticsService.GetEntities(batchInput, CancellationToken.None).Result;
        ////var sentimentBatchResults = s_analyticsService.GetSentiment(batchInput, CancellationToken.None).Result;

        // Focusing more on key phrases than entity or sentiment api endpoints.
        // NOTE(review): .Result blocks the calling thread; kept because this method is
        // synchronous and callers may rely on the AggregateException it surfaces.
        var keyPhraseBatchResults = s_analyticsService.GetKeyPhrases(batchInput, CancellationToken.None).Result;

        for (var i = 0; i < batchRequest.Count; i++)
        {
            var request = batchRequest[i];
            var chunkIds = batchChunkIds[i];

            // A file name without an "_OD<digits>" part (or with a number too large for
            // an int) leaves the order number at 0 instead of aborting the whole run,
            // mirroring how Fid falls back to an empty string when "FID" is absent.
            int orderNumber;
            int.TryParse(
                Regex.Match(request.FileName, @"_OD(\d+)").Groups[1].Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out orderNumber);

            // Break down file information with results differently for saving to DB or another file.
            // Results are matched on the exact chunk ids submitted for this file, so a file
            // named "doc1" no longer picks up the results of "doc10".
            var completeResult = new TextAnalysisRecord()
            {
                RequestInfo = request,
                Fid = Regex.Match(request.FileName, @"FID(\d{9})_?").Groups[1].Value,
                OrderNumber = orderNumber,
                KeyPhrases = keyPhraseBatchResults.Documents
                    .Where(x => chunkIds.Contains(x.Id))
                    .SelectMany(x => x.KeyPhrases)
                    .ToList(),
                KeyPhraseErrors = keyPhraseBatchResults.Errors
                    .Where(e => chunkIds.Contains(e.Id))
                    .Select(e => e.Message)
            };

            // Let Path.Combine supply the separator rather than hard-coding a backslash.
            var path = Path.Combine(outputPath, request.FolderName, $"{request.FileName}.json");

            // Save the record to a json file for review
            completeResult.SaveRecordJsonFile(path);
        }
    }
}