protected async override Task <ProcessResult> OnPolling(PollerJobParameters parameters, string workingFolder) { //if this handle already has rasterImage we do not need to do anything //we can skip only if this action is not forced or if the pdf was not generated by office //because office file can be re-generated due to errors in conversoin (libreoffice or MsOffice) if (!IsForced(parameters) && !FromPipelineId(parameters, "office")) { var formats = GetFormats(parameters.TenantId, parameters.JobId); if (formats.Any(f => f == DocumentFormats.RasterImage)) { return(ProcessResult.Ok); } } String format = parameters.All[JobKeys.ThumbnailFormat]; Logger.DebugFormat("Conversion for jobId {0} in format {1} starting", parameters.JobId, format); var task = _taskFactory(); var passwords = ClientPasswordSet.GetPasswordFor(parameters.FileName); foreach (var password in passwords) { task.Passwords.Add(password); } string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder).ConfigureAwait(false); var convertParams = new CreatePdfImageTaskParams() { Dpi = parameters.GetIntOrDefault(JobKeys.Dpi, 150), FromPage = parameters.GetIntOrDefault(JobKeys.PagesFrom, 1), Pages = parameters.GetIntOrDefault(JobKeys.PagesCount, 1), Format = (CreatePdfImageTaskParams.ImageFormat)Enum.Parse(typeof(CreatePdfImageTaskParams.ImageFormat), format, true) }; await task.Run( pathToFile, convertParams, (i, s) => Write(workingFolder, parameters, format, i, s) //Currying ).ConfigureAwait(false); Logger.DebugFormat("Conversion of {0} in format {1} done", parameters.JobId, format); return(ProcessResult.Ok); }
protected async override Task <ProcessResult> OnPolling( PollerJobParameters parameters, String workingFolder) { Boolean result; var contentFileName = Path.ChangeExtension(parameters.FileName, ".content"); if (!_formats.Contains(parameters.FileExtension)) { Logger.DebugFormat("Document for job Id {0} has an extension not supported, setting null content", parameters.JobId); return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName))); } Logger.DebugFormat("Starting tika on job: {0}, file extension {1}", parameters.JobId, parameters.FileExtension); Logger.DebugFormat("Downloading blob for job: {0}, on local path {1}", parameters.JobId, workingFolder); string pathToFile = await DownloadBlob(parameters.TenantId, parameters.JobId, parameters.FileName, workingFolder); pathToFile = ProcessFile(pathToFile, workingFolder); Boolean shouldAnalyze = _filterManager.ShouldAnalyze(parameters.FileName, pathToFile); if (!shouldAnalyze) { Logger.InfoFormat("File {0} for job {1} was discharded!", parameters.FileName, parameters.JobId); return(new ProcessResult(await AddNullContentFormat(parameters, contentFileName))); } Logger.DebugFormat("Search for password JobId:{0}", parameters.JobId); var passwords = ClientPasswordSet.GetPasswordFor(parameters.FileName).ToArray(); String content = ""; Int32 analyzerOrdinal = 0; Boolean success = false; var analyzer = BuildAnalyzer(analyzerOrdinal); do { try { if (passwords.Any()) { //Try with all the password foreach (var password in passwords) { try { content = analyzer.GetHtmlContent(pathToFile, password) ?? ""; break; //first password that can decrypt file break the list of password to try } catch (Exception) { Logger.ErrorFormat("Error opening file {0} with password", parameters.FileName); } } } else { //Simply analyze file without password Logger.DebugFormat("Analyze content JobId: {0} -> Path: {1}", parameters.JobId, pathToFile); content = analyzer.GetHtmlContent(pathToFile, "") ?? ""; } success = true; } catch (Exception ex) { Logger.ErrorFormat(ex, "Error extracting tika with analyzer {0} on file {1}", analyzer.Describe(), parameters.FileName, parameters.JobId); analyzer = BuildAnalyzer(++analyzerOrdinal); if (analyzer != null) { Logger.InfoFormat("Retry job {0} with analyzer {1}", parameters.JobId, analyzer.Describe()); } } } while (analyzer != null && success == false); Logger.DebugFormat("Finished tika on job: {0}, charsNum {1}", parameters.JobId, content.Count()); String sanitizedContent = content; if (!string.IsNullOrWhiteSpace(content)) { var resultContent = _builder.CreateFromTikaPlain(content); var documentContent = resultContent.Content; sanitizedContent = resultContent.SanitizedTikaContent; var pages = documentContent.Pages.Count(); string lang = null; if (pages > 1) { lang = LanguageDetector.GetLanguage(documentContent.Pages[1].Content); } if (lang == null && pages == 1) { lang = LanguageDetector.GetLanguage(documentContent.Pages[0].Content); } if (lang != null) { documentContent.AddMetadata(DocumentContent.MedatataLanguage, lang); } result = await AddFormatToDocumentFromObject( parameters.TenantId, this.QueueName, parameters.JobId, new DocumentFormat(DocumentFormats.Content), documentContent, contentFileName, new Dictionary <string, object>()); Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Content, parameters.JobId, result); } var tikaFileName = Path.Combine(workingFolder, Path.GetFileNameWithoutExtension(parameters.FileName) + ".tika.html"); tikaFileName = SanitizeFileNameForLength(tikaFileName); File.WriteAllText(tikaFileName, sanitizedContent); result = await AddFormatToDocumentFromFile( parameters.TenantId, parameters.JobId, new DocumentFormat(DocumentFormats.Tika), tikaFileName, new Dictionary <string, object>()); Logger.DebugFormat("Added format {0} to jobId {1}, result: {2}", DocumentFormats.Tika, parameters.JobId, result); return(ProcessResult.Ok); }