/// <summary>
        /// Extracts the text of every exported repository file whose extension is supported
        /// by the text engine and stores the result in <c>ContentText</c> of that file.
        /// Files that were not exported, that are missing on disk, or whose extension is not
        /// supported are left untouched.
        /// </summary>
        /// <param name="files">The repository files to process; updated in place.</param>
        /// <param name="path">Directory that is combined with each file's <c>PhysicalName</c> to locate it on disk.</param>
        /// <param name="archiveRecordId">Id of the archive record; only used for log messages.</param>
        private async Task ProcessFiles(List <RepositoryFile> files, string path, string archiveRecordId)
        {
            // Skip empty directories (also avoids a needless call to the text engine)
            if (files.Count == 0)
            {
                return;
            }

            // The supported types do not depend on the individual file, so fetch them once
            // instead of once per exported file.
            var supportedFileTypesForTextExtraction = await textEngine.GetSupportedFileTypes();

            foreach (var repositoryFile in files)
            {
                var diskFile = new FileInfo(Path.Combine(path, repositoryFile.PhysicalName));

                if (!repositoryFile.Exported)
                {
                    Log.Information("Skipping {diskFile} as it was not downloaded from the repository", diskFile);
                    continue;
                }

                if (!diskFile.Exists)
                {
                    // Nothing to extract from a missing file: warn and move on instead of
                    // handing a non-existent path to the text engine.
                    Log.Warning("Unable to find file on disk at {diskFile} for {archiveRecordId}", diskFile, archiveRecordId);
                    continue;
                }

                // We have found a valid file. Extract the text if the extension is supported.
                // FileInfo.Extension includes the leading dot, the supported list is compared without it.
                if (!supportedFileTypesForTextExtraction.Contains(diskFile.Extension.Replace(".", "")))
                {
                    continue;
                }

                Log.Information("Start extracting text for file: {FullName} for archive record id {archiveRecordId}", diskFile.FullName, archiveRecordId);
                repositoryFile.ContentText = await textEngine.ExtractText(diskFile.FullName);
            }
        }
Example #2
0
        /// <summary>
        /// Extracts text for the given repository files in parallel. The files are first converted
        /// to text extraction files by the pdf manipulator, each existing file with a supported
        /// extension is sent to the text engine, and the extracted texts are finally transferred
        /// back to the original repository files.
        /// </summary>
        /// <param name="files">The repository files to process; receives the extracted text.</param>
        /// <param name="path">Directory passed to the pdf manipulator to locate the files.</param>
        /// <param name="context">Job context; supplies the archive record id for logging and is passed on to the text engine.</param>
        private async Task ProcessFiles(List <RepositoryFile> files, string path, JobContext context)
        {
            // Skip empty directories
            if (files.Count == 0)
            {
                return;
            }

            var supportedFileTypesForTextExtraction = await textEngine.GetSupportedFileTypes();

            // Create the list with the text extraction files.
            // This list will contain the split file names for processing
            // This list does not contain files that didn't have the flag exported or should be skipped
            var textExtractionFiles = pdfManipulator.ConvertToTextExtractionFiles(files, path);

            var sw = Stopwatch.StartNew();
            var parallelism = Settings.Default.TextExtractParallelism;

            Log.Information("Starting parallel ocr extraction for-each-loop with parallelism of {parallelism} for {Count} files of archiveRecordId {archiveRecord}",
                            parallelism, files.Count, context.ArchiveRecordId);

            await textExtractionFiles.ParallelForEachAsync(async textExtractionFile =>
            {
                var diskFile = new FileInfo(textExtractionFile.FullName);
                if (!diskFile.Exists)
                {
                    // A missing file cannot be extracted: warn and skip it instead of handing a
                    // non-existent path to the text engine.
                    Log.Warning("Unable to find file on disk at {diskFile} for {archiveRecordId}", diskFile, context.ArchiveRecordId);
                    return;
                }

                // We have found a valid file. Extract the text if the extension is supported.
                // FileInfo.Extension includes the leading dot, the supported list is compared without it.
                if (supportedFileTypesForTextExtraction.Contains(diskFile.Extension.Replace(".", "")))
                {
                    Log.Information("Start extracting text for file: {FullName} for archive record id {archiveRecordId} on thread {threadId}", diskFile.FullName,
                                    context.ArchiveRecordId, Thread.CurrentThread.ManagedThreadId);
                    textExtractionFile.ContentText = await textEngine.ExtractText(diskFile.FullName, context);
                }
            }, parallelism, true);

            // Now convert the extracted texts back to the original repository files
            pdfManipulator.TransferExtractedText(textExtractionFiles, files);

            sw.Stop();
            Log.Information("Finished parallel ocr extraction for-each-loop with parallelism of {parallelism} for {Count} files of archiveRecordId {archiveRecord} in {TotalSeconds}",
                            parallelism, files.Count, context.ArchiveRecordId, sw.Elapsed.TotalSeconds);

            // NOTE(review): the serialized JSON is passed as the log message itself. If the logger
            // treats its first argument as a message template, braces in the JSON may be parsed as
            // placeholders - confirm and consider logging it as a property value instead.
            Log.Debug(JsonConvert.SerializeObject(files));
        }