/// <summary>
/// Dispatches the parsed document to the chunking strategy selected by
/// <paramref name="chunkMethod"/>.
/// </summary>
/// <param name="documentTree">Parsed document to split into chunks.</param>
/// <param name="chunkMethod">Strategy used to split the document.</param>
/// <param name="charLimit">Maximum character count per chunk (used by the Char, Page and Section strategies).</param>
/// <param name="chunkLevel">Section element level at which to split (used by the Section strategy).</param>
/// <returns>The list of chunks produced by the selected strategy.</returns>
/// <exception cref="NotSupportedException">Thrown when <paramref name="chunkMethod"/> is not a recognized strategy.</exception>
public List<ChunkInfo> Chunk(DocumentTree documentTree, ChunkMethod chunkMethod, int charLimit, ElementType chunkLevel)
{
    switch (chunkMethod)
    {
        case ChunkMethod.NoChunking:
            return ApplyNoChunking(documentTree);
        case ChunkMethod.Char:
            return ChunkByCharacterLimit(documentTree, charLimit);
        case ChunkMethod.Page:
            return ChunkByPage(documentTree, charLimit);
        case ChunkMethod.Section:
            return ChunkBySection(documentTree, chunkLevel, charLimit);
        default:
            throw new NotSupportedException($"The chunk type {chunkMethod} isn't supported.");
    }
}
/// <summary>
/// Reads every file from the source storage, parses and chunks it, runs the
/// requested prediction service(s) on the chunks, concatenates the per-chunk
/// results, and writes the combined result as JSON to the destination storage.
/// Files are processed concurrently; per-file failures are collected rather
/// than aborting the batch.
/// </summary>
/// <param name="sourceStorageType">Storage backend to read input files from.</param>
/// <param name="destinationStorageType">Storage backend to write result files to.</param>
/// <param name="chunkType">How each parsed document is split into chunks.</param>
/// <param name="service">Which cognitive service(s) to run: CustomText, TextAnalytics, or Both.</param>
public async Task Predict(StorageType sourceStorageType, StorageType destinationStorageType, ChunkMethod chunkType, CognitiveServiceType service)
{
    InitializeStorage(sourceStorageType, destinationStorageType);

    // Fetch the chunker config once instead of once per property.
    var chunkerConfig = _configurationService.GetChunkerConfigModel();
    var charLimit = chunkerConfig.CharLimit;
    var chunkLevel = chunkerConfig.ChunkSectionLevel;
    var defaultOps = _configurationService.GetTextAnalyticsConfigModel().DefaultOperations;

    var convertedFiles = new ConcurrentBag<string>();
    var failedFiles = new ConcurrentDictionary<string, string>();

    // Check which service(s) to run (== used consistently for enum comparison).
    var runCustomText = service == CognitiveServiceType.CustomText || service == CognitiveServiceType.Both;
    var runTextAnalytics = service == CognitiveServiceType.TextAnalytics || service == CognitiveServiceType.Both;

    // read files from source storage
    var fileNames = await _sourceStorageService.ListFilesAsync();

    // parse files concurrently; each file is independent
    var tasks = fileNames.Select(async fileName =>
    {
        try
        {
            // select parser according to file extension
            var fileType = Path.GetExtension(fileName);
            var parsingService = _parserPoolManager.GetParser(fileType, fileName);

            // read file
            _loggerService.LogOperation(OperationType.ReadingFile, fileName);
            var file = await _sourceStorageService.ReadFileAsync(fileName);

            // parse file
            _loggerService.LogOperation(OperationType.ParsingFile, fileName);
            var parseResult = await parsingService.ParseFile(file);

            // chunk file
            _loggerService.LogOperation(OperationType.ChunkingFile, fileName);
            var chunkedText = _chunkerService.Chunk(parseResult, chunkType, charLimit, chunkLevel);

            // prediction service(s): only the ones requested by the caller run;
            // the others contribute null to the concatenation below
            _loggerService.LogOperation(OperationType.RunningPrediction, fileName);
            var queries = chunkedText.Select(r => r.Text).ToList();
            var customTextResponse = runCustomText ? await _customTextPredictionService.GetPredictionBatchAsync(queries) : null;
            var sentimentResponse = runTextAnalytics && defaultOps.Sentiment ? await _textAnalyticsPredictionService.PredictSentimentBatchAsync(queries) : null;
            var nerResponse = runTextAnalytics && defaultOps.Ner ? await _textAnalyticsPredictionService.PredictNerBatchAsync(queries) : null;
            var keyphraseResponse = runTextAnalytics && defaultOps.Keyphrase ? await _textAnalyticsPredictionService.PredictKeyphraseBatchAsync(queries) : null;

            // merge per-chunk predictions back into a single document-level result
            var concatenatedResponse = _concatenationService.ConcatPredictionResult(chunkedText.ToArray(), customTextResponse, sentimentResponse, nerResponse, keyphraseResponse);
            var responseAsJson = JsonConvert.SerializeObject(concatenatedResponse, Formatting.Indented);

            // store result as "<original name without extension>.json"
            _loggerService.LogOperation(OperationType.StoringResult, fileName);
            var newFileName = Path.GetFileNameWithoutExtension(fileName) + ".json";
            await _destinationStorageService.StoreDataAsync(responseAsJson, newFileName);
            convertedFiles.Add(fileName);
        }
        catch (CliException e)
        {
            // Record the failure for this file only; remaining files keep processing.
            failedFiles[fileName] = e.Message;
            _loggerService.LogError(e);
        }
    });
    await Task.WhenAll(tasks);
    _loggerService.LogParsingResult(convertedFiles, failedFiles);
}
/// <summary>
/// Reads every file from the source storage, parses and chunks it, and writes
/// each chunk's raw text to the destination storage as a separate file.
/// Files are processed concurrently; per-file failures are collected rather
/// than aborting the batch.
/// </summary>
/// <param name="sourceStorageType">Storage backend to read input files from.</param>
/// <param name="destinationStorageType">Storage backend to write chunk files to.</param>
/// <param name="chunkType">How each parsed document is split into chunks.</param>
public async Task ExtractText(StorageType sourceStorageType, StorageType destinationStorageType, ChunkMethod chunkType)
{
    InitializeStorage(sourceStorageType, destinationStorageType);

    // Fetch the chunker config once instead of once per property.
    var chunkerConfig = _configurationService.GetChunkerConfigModel();
    var charLimit = chunkerConfig.CharLimit;
    var chunkLevel = chunkerConfig.ChunkSectionLevel;

    var convertedFiles = new ConcurrentBag<string>();
    var failedFiles = new ConcurrentDictionary<string, string>();

    // read files from source storage
    var fileNames = await _sourceStorageService.ListFilesAsync();

    // parse files concurrently; each file is independent
    var tasks = fileNames.Select(async fileName =>
    {
        try
        {
            // select parser according to file extension
            var fileType = Path.GetExtension(fileName);
            var parsingService = _parserPoolManager.GetParser(fileType, fileName);

            // read file
            _loggerService.LogOperation(OperationType.ReadingFile, fileName);
            var file = await _sourceStorageService.ReadFileAsync(fileName);

            // parse file
            _loggerService.LogOperation(OperationType.ParsingFile, fileName);
            var parseResult = await parsingService.ParseFile(file);

            // chunk file
            _loggerService.LogOperation(OperationType.ChunkingFile, fileName);
            var chunkedText = _chunkerService.Chunk(parseResult, chunkType, charLimit, chunkLevel);

            // store each chunk under an index-derived file name, sequentially
            // to preserve chunk ordering within a file
            _loggerService.LogOperation(OperationType.StoringResult, fileName);
            foreach (var item in chunkedText.Select((value, i) => (value, i)))
            {
                var newFileName = ChunkInfoHelper.GetChunkFileName(fileName, item.i);
                await _destinationStorageService.StoreDataAsync(item.value.Text, newFileName);
            }
            convertedFiles.Add(fileName);
        }
        catch (CliException e)
        {
            // Record the failure for this file only; remaining files keep processing.
            failedFiles[fileName] = e.Message;
            _loggerService.LogError(e);
        }
    });
    await Task.WhenAll(tasks);
    _loggerService.LogParsingResult(convertedFiles, failedFiles);
}