/// <summary>
/// Creates a chunk from raw text: trims surrounding whitespace, records the
/// character count, and generates a summary via <see cref="ChunkInfoHelper.GetChunksummary"/>.
/// </summary>
/// <param name="chunkText">The raw chunk text; leading/trailing whitespace is removed.</param>
public ChunkInfo(string chunkText)
{
    var text = chunkText.Trim();
    Text = text;
    // Fix: keep CharCount consistent with the (chunkNumber, ...) overload, which
    // sets it to the trimmed length; previously this overload left it at its default.
    CharCount = text.Length;
    Summary = ChunkInfoHelper.GetChunksummary(text);
}
/// <summary>
/// Creates a fully-populated chunk with its ordinal number, trimmed text,
/// character count, optional page range, and generated summary.
/// </summary>
/// <param name="chunkNumber">Zero- or one-based position of this chunk in the document (caller-defined).</param>
/// <param name="chunkText">The raw chunk text; leading/trailing whitespace is removed.</param>
/// <param name="startPage">First page the chunk spans, if known.</param>
/// <param name="endPage">Last page the chunk spans, if known.</param>
public ChunkInfo(int chunkNumber, string chunkText, int? startPage, int? endPage)
{
    var trimmed = chunkText.Trim();

    ChunkNumber = chunkNumber;
    Text = trimmed;
    CharCount = trimmed.Length;
    StartPage = startPage;
    EndPage = endPage;
    Summary = ChunkInfoHelper.GetChunksummary(trimmed);
}
/// <summary>
/// Parses every file in the source storage with a type-specific parser, chunks the
/// parsed result, and writes each chunk to the destination storage as a separate file.
/// Files are processed concurrently; a <see cref="CliException"/> for one file is
/// recorded as a failure without aborting the rest of the batch.
/// </summary>
/// <param name="sourceStorageType">Where input files are read from.</param>
/// <param name="destinationStorageType">Where chunk files are written to.</param>
/// <param name="chunkType">Chunking strategy passed through to the chunker service.</param>
public async Task ExtractText(StorageType sourceStorageType, StorageType destinationStorageType, ChunkMethod chunkType)
{
    InitializeStorage(sourceStorageType, destinationStorageType);

    // Fetch the chunker configuration once instead of calling
    // GetChunkerConfigModel() twice for two properties of the same model.
    var chunkerConfig = _configurationService.GetChunkerConfigModel();
    var charLimit = chunkerConfig.CharLimit;
    var chunkLevel = chunkerConfig.ChunkSectionLevel;

    // Thread-safe accumulators: the per-file lambdas below run concurrently.
    var convertedFiles = new ConcurrentBag<string>();
    var failedFiles = new ConcurrentDictionary<string, string>();

    // read files from source storage
    var fileNames = await _sourceStorageService.ListFilesAsync();

    // parse files concurrently; note only CliException is treated as a per-file
    // failure — any other exception type faults the whole Task.WhenAll below.
    var tasks = fileNames.Select(async fileName =>
    {
        try
        {
            // select parser according to file extension
            var fileType = Path.GetExtension(fileName);
            var parsingService = _parserPoolManager.GetParser(fileType, fileName);

            // read file
            _loggerService.LogOperation(OperationType.ReadingFile, fileName);
            var file = await _sourceStorageService.ReadFileAsync(fileName);

            // parse file
            _loggerService.LogOperation(OperationType.ParsingFile, fileName);
            var parseResult = await parsingService.ParseFile(file);

            // chunk file
            _loggerService.LogOperation(OperationType.ChunkingFile, fileName);
            var chunkedText = _chunkerService.Chunk(parseResult, chunkType, charLimit, chunkLevel);

            // store each chunk under an index-derived file name
            _loggerService.LogOperation(OperationType.StoringResult, fileName);
            foreach (var item in chunkedText.Select((value, i) => (value, i)))
            {
                var newFileName = ChunkInfoHelper.GetChunkFileName(fileName, item.i);
                await _destinationStorageService.StoreDataAsync(item.value.Text, newFileName);
            }

            convertedFiles.Add(fileName);
        }
        catch (CliException e)
        {
            failedFiles[fileName] = e.Message;
            _loggerService.LogError(e);
        }
    });

    await Task.WhenAll(tasks);
    _loggerService.LogParsingResult(convertedFiles, failedFiles);
}
/// <summary>
/// Reads every file from the source storage, parses it with the configured parser,
/// chunks it by character count, and writes each chunk to the destination storage.
/// Files are processed concurrently; a <see cref="CliException"/> for one file is
/// recorded as a failure without stopping the rest of the batch.
/// </summary>
/// <param name="sourceStorageType">Where input files are read from.</param>
/// <param name="destinationStorageType">Where chunk files are written to.</param>
public async Task ChunkTextAsync(StorageType sourceStorageType, StorageType destinationStorageType)
{
    InitializeStorage(sourceStorageType, destinationStorageType);

    var charLimit = _configurationService.GetChunkerConfigModel().CharLimit;

    // Thread-safe accumulators: the per-file lambdas below run concurrently.
    var convertedFiles = new ConcurrentBag<string>();
    var failedFiles = new ConcurrentDictionary<string, string>();

    // enumerate the files to process
    var fileNames = await _sourceStorageService.ListFilesAsync();

    // chunk all files concurrently
    var tasks = fileNames.Select(async fileName =>
    {
        try
        {
            // reject unsupported file types up front
            _parserService.ValidateFileType(fileName);

            // read file
            _loggerService.LogOperation(OperationType.ReadingFile, fileName);
            var file = await _sourceStorageService.ReadFileAsync(fileName);

            // parse file
            var parsedFile = await _parserService.ParseFile(file);

            // chunk file by character count
            _loggerService.LogOperation(OperationType.ChunkingFile, fileName);
            List<ChunkInfo> chunks = _chunkerService.Chunk(parsedFile, ChunkMethod.Char, charLimit, ElementType.Other);

            // persist each chunk under an index-derived file name
            _loggerService.LogOperation(OperationType.StoringResult, fileName);
            for (var index = 0; index < chunks.Count; index++)
            {
                var chunkFileName = ChunkInfoHelper.GetChunkFileName(fileName, index);
                await _destinationStorageService.StoreDataAsync(chunks[index].Text, chunkFileName);
            }

            convertedFiles.Add(fileName);
        }
        catch (CliException e)
        {
            failedFiles[fileName] = e.Message;
            _loggerService.LogError(e);
        }
    });

    await Task.WhenAll(tasks);
    _loggerService.LogParsingResult(convertedFiles, failedFiles);
}