/// <summary>
/// Sequentially extracts the text of the exported repository files found below
/// <paramref name="path"/> and stores the result in <c>ContentText</c> of each file.
/// </summary>
/// <param name="files">The repository files to inspect. Files not flagged as exported are skipped.</param>
/// <param name="path">Directory that is combined with each file's <c>PhysicalName</c> to locate it on disk.</param>
/// <param name="archiveRecordId">Id of the archive record; only used for logging here.</param>
/// <returns>A task that completes once all files have been processed.</returns>
private async Task ProcessFiles(List<RepositoryFile> files, string path, string archiveRecordId)
{
    // Skip empty directories, so the text engine is not queried for nothing.
    if (files.Count == 0)
    {
        return;
    }

    // The lookup takes no per-file argument, so it is fetched once instead of once per file.
    var supportedFileTypesForTextExtraction = await textEngine.GetSupportedFileTypes();

    foreach (var repositoryFile in files)
    {
        var diskFile = new FileInfo(Path.Combine(path, repositoryFile.PhysicalName));

        if (!repositoryFile.Exported)
        {
            Log.Information("Skipping {diskFile} as it was not downloaded from the repository", diskFile);
            continue;
        }

        if (!diskFile.Exists)
        {
            Log.Warning("Unable to find file on disk at {diskFile} for {archiveRecordId}", diskFile, archiveRecordId);

            // There is nothing to extract from a file that is not on disk.
            continue;
        }

        // We have found a valid file. Extract the text if the extension is supported.
        // FileInfo.Extension includes the leading dot, which is removed before the lookup.
        if (supportedFileTypesForTextExtraction.Contains(diskFile.Extension.Replace(".", "")))
        {
            Log.Information("Start extracting text for file: {FullName} for archive record id {archiveRecordId}", diskFile.FullName, archiveRecordId);
            repositoryFile.ContentText = await textEngine.ExtractText(diskFile.FullName);
        }
    }
}
/// <summary>
/// Extracts the text of the given repository files in parallel and transfers the
/// extracted text back to <paramref name="files"/>.
/// </summary>
/// <param name="files">The repository files to process. Nothing happens when the list is empty.</param>
/// <param name="path">Directory passed to the PDF manipulator to build the list of text extraction files.</param>
/// <param name="context">Job context; supplies the archive record id for logging and is passed on to the text engine.</param>
/// <returns>A task that completes once all files have been processed.</returns>
private async Task ProcessFiles(List<RepositoryFile> files, string path, JobContext context)
{
    // Skip empty directories
    if (files.Count == 0)
    {
        return;
    }

    var supportedFileTypesForTextExtraction = await textEngine.GetSupportedFileTypes();

    // Create the list with the text extraction files.
    // This list will contain the split file names for processing.
    // This list does not contain files that didn't have the flag exported or should be skipped.
    var textExtractionFiles = pdfManipulator.ConvertToTextExtractionFiles(files, path);

    var sw = Stopwatch.StartNew();
    var parallelism = Settings.Default.TextExtractParallelism;
    Log.Information("Starting parallel ocr extraction for-each-loop with parallelism of {parallelism} for {Count} files of archiveRecordId {archiveRecord}",
        parallelism, files.Count, context.ArchiveRecordId);

    await textExtractionFiles.ParallelForEachAsync(async textExtractionFile =>
    {
        var diskFile = new FileInfo(textExtractionFile.FullName);
        if (!diskFile.Exists)
        {
            Log.Warning("Unable to find file on disk at {diskFile} for {archiveRecordId}", diskFile, context.ArchiveRecordId);

            // There is nothing to extract from a file that is not on disk.
            return;
        }

        // We have found a valid file. Extract the text if the extension is supported.
        // FileInfo.Extension includes the leading dot, which is removed before the lookup.
        if (supportedFileTypesForTextExtraction.Contains(diskFile.Extension.Replace(".", "")))
        {
            Log.Information("Start extracting text for file: {FullName} for archive record id {archiveRecordId} on thread {threadId}",
                diskFile.FullName, context.ArchiveRecordId, Thread.CurrentThread.ManagedThreadId);
            textExtractionFile.ContentText = await textEngine.ExtractText(diskFile.FullName, context);
        }
    }, parallelism, true);

    // Now convert the extracted texts back to the original repository files
    pdfManipulator.TransferExtractedText(textExtractionFiles, files);

    sw.Stop();
    Log.Information("Finished parallel ocr extraction for-each-loop with parallelism of {parallelism} for {Count} files of archiveRecordId {archiveRecord} in {TotalSeconds}",
        parallelism, files.Count, context.ArchiveRecordId, sw.Elapsed.TotalSeconds);

    // NOTE(review): the serialized JSON is passed as the log message itself and is built
    // regardless of the active log level - confirm this is intended.
    Log.Debug(JsonConvert.SerializeObject(files));
}